In [None]:
import tensorflow as tf
import numpy as np


# Turn on TensorFlow eager execution for the rest of the notebook.
tf.enable_eager_execution()

In [None]:
from matplotlib import pyplot as plt

def positional_encoding(shape,debug=False):
    """Build a sine/cosine position table and repeat it along the first axis.

    Args:
        shape: indexable shape with (at least) three entries. ``shape[-2]`` is
            used as the number of positions, ``shape[-1]/2`` as the number of
            sine (and cosine) columns, and ``shape[0]`` as the repeat count.
        debug: when True, print the table's shape and show the transposed
            table with matplotlib.

    Returns:
        The table reshaped to ``(1, shape[1], shape[2])`` and tiled
        ``shape[0]`` times along the first axis.
    """
    seq_len = tf.cast(shape[-2], tf.float32)
    half_width = tf.cast(shape[-1]/2, tf.float32)
    # Positions are divided by a fifteenth of the sequence length before being
    # raised to the per-column exponent.
    scale = seq_len/15
    col_idx, row_idx = tf.meshgrid(tf.range(0, half_width), tf.range(0, seq_len))

    base = tf.cast(row_idx, tf.float32)/scale
    exponent = 2*tf.cast(col_idx,tf.float32)/half_width
    phase = tf.pow(base, exponent)

    # Sine columns first, cosine columns second, joined along axis 1.
    table = tf.concat([tf.sin(phase), tf.cos(phase)], axis = 1)
    if debug:
        print("stacked.shape", table.shape)
        print("transposed image")
        plt.imshow(tf.transpose(table), interpolation='nearest')
        plt.show()

    table = tf.reshape(table, (1,shape[1],shape[2]))
    return tf.tile(table, (shape[0],1,1))

# NOTE(review): the tensor created on the next line is never assigned or used.
tf.constant(np.ones([32,40,58]))
# Keras input with batch size 32 and per-sample shape (40, 58).
inp=tf.keras.Input((40,58), batch_size=32)

# Run the encoding with debug=True so the table shape is printed and plotted.
v=positional_encoding(tf.TensorShape(inp.shape), True)

print(v,v.shape)

In [None]:

def preprocess_attention(input, debug=False, query_count=10):
    """Project `input` into key, query and value tensors with three Dense layers.

    Args:
        input: tensor whose last axis size (``input.shape[-1]``) is taken as
            the attention dimension.
        debug: when True, print a banner and the shapes of the three outputs.
        query_count: multiplier for the projection width. Keys and queries get
            ``attention_dim * query_count`` units, values get twice that.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        Tuple ``(k, q, v)`` of the three projected tensors.
    """
    if debug:
        print("preprocess_attention")
    attention_dim=input.shape[-1]
    width = attention_dim*query_count

    # A fresh Dense layer is created per projection, in the order k, q, v.
    k = tf.keras.layers.Dense(width)(input)
    q = tf.keras.layers.Dense(width)(input)
    v = tf.keras.layers.Dense(2*width)(input)

    if debug:
        print("k,q,v",k.shape.as_list(),q.shape.as_list(),v.shape.as_list())

    return k,q,v

def multihead_attention(k,q,v,attention_dim,heads=20,debug=False):
    """Multi-head scaled dot-product attention over sequence positions.

    `k`, `q` and `v` are each projected by a Dense layer onto `heads` heads
    (keys and queries of width `attention_dim`, values of width
    ``2 * attention_dim``). For every head, each query position is scored
    against every key position, the scores are soft-maxed over the key
    positions and used to mix the values. The heads are then concatenated and
    projected back to `attention_dim`.

    The previous implementation used ``tf.tensordot`` / ``K.dot``, which are
    not batched and therefore contracted across the batch and step axes; the
    products here are batched ``tf.matmul`` calls instead.

    Args:
        k: keys, rank-3 tensor (batch, key steps, features).
        q: queries, rank-3 tensor (batch, query steps, features).
        v: values, rank-3 tensor (batch, key steps, features).
        attention_dim: per-head key/query width and width of the result;
            anything ``int()`` accepts.
        heads: number of attention heads.
        debug: when True, print intermediate shapes.

    Returns:
        Tensor of shape (batch, query steps, attention_dim).
    """
    if debug:
        print("multihead_attention")
        print("kqv shapes, heads", k.shape, q.shape, v.shape, heads)
    # Normalise to a plain int so it can be used in layer sizes and reshapes.
    head_dim = int(attention_dim)
    value_dim = 2*head_dim
    scale = float(head_dim) ** 0.5

    # project original q,k,v onto #heads (layers created in the order K, Q, V)
    K = tf.keras.layers.Dense(head_dim*heads, activation=None)(k)
    Q = tf.keras.layers.Dense(head_dim*heads, activation=None)(q)
    V = tf.keras.layers.Dense(value_dim*heads, activation=None)(v)

    # (batch, steps, heads*dim) -> (batch, steps, heads, dim); -1 lets the
    # step axis be inferred so keys and queries may differ in length.
    K = tf.keras.layers.Reshape((-1, heads, head_dim))(K)
    Q = tf.keras.layers.Reshape((-1, heads, head_dim))(Q)
    V = tf.keras.layers.Reshape((-1, heads, value_dim))(V)

    # (batch, steps, heads, dim) -> (batch, heads, steps, dim) so that matmul
    # batches over (batch, heads) and contracts over the feature axis.
    K = tf.keras.layers.Permute((2,1,3))(K)
    Q = tf.keras.layers.Permute((2,1,3))(Q)
    V = tf.keras.layers.Permute((2,1,3))(V)

    # attention_logits: (batch, heads, query steps, key steps)
    attention_logits = tf.matmul(Q, K, transpose_b=True) / scale
    if debug:
        print("Query shape, Keys' shape, attention_logits shape")
        print(Q.shape.as_list(), K.shape.as_list(), attention_logits.shape.as_list())

    # Softmax over the last axis: each query's weights over key positions sum to 1.
    attention = tf.nn.softmax(attention_logits)

    # selected_values: (batch, heads, query steps, value_dim)
    selected_values = tf.matmul(attention, V)
    # back to (batch, query steps, heads, value_dim), then merge the heads
    selected_values = tf.keras.layers.Permute((2,1,3))(selected_values)
    concat = tf.keras.layers.Reshape((-1, heads*value_dim))(selected_values)

    # project back into the dimension of the attention_vector
    output = tf.keras.layers.Dense(units=head_dim, activation=None)(concat)
    if debug:
        print("attention:", attention.shape, ", selected_values:", selected_values.shape, ", concat:", concat.shape, ", output:", output.shape)

    return output

# Smoke test: push an all-ones (32, 10, attention_dim) tensor through the
# projection and attention functions with debug output enabled.
attention_dim=4
t=tf.convert_to_tensor(np.ones([32,10,attention_dim]), dtype=tf.float32)
k,q,v=preprocess_attention(t, True)
mh= multihead_attention(k,q,v,attention_dim,debug=True)

# Print the shape of the attention output.
print(mh.shape)




In [None]:

#tf.keras.backend.dot(a,b)

In [None]:
def transformer_module(input, output=None, debug=False):
    """Apply one attention block followed by dropout, layer norm and a dense residual.

    Args:
        input: tensor that keys and values (and, without `output`, queries)
            are derived from; its last axis size is the attention dimension.
        output: optional second tensor. When given, it first receives an
            attention residual computed from itself, and is then used both as
            the query and as the residual base for the attention over `input`.
        debug: forwarded to the helpers; also prints shapes here.

    Returns:
        The layer-normalised result of the block.
    """
    attention_dim=input.shape[-1]
    if debug:
        print("module input",input.shape.as_list())

    residual = input

    if output is not None:
        # Attention computed from `output` alone, added back onto `output`.
        out_k, out_q, out_v = preprocess_attention(output, debug=debug)
        output = output + multihead_attention(out_k, out_q, out_v, attention_dim, debug=debug)

    keys, queries, values = preprocess_attention(input, debug=debug)

    if output is not None:
        # With a second tensor present, it supplies the queries and the residual.
        queries = output
        residual = output

    attended = residual + multihead_attention(keys, queries, values, attention_dim, debug=debug)
    if debug:
        print("attention_output",attended.shape.as_list())

    dropped = tf.keras.layers.Dropout(.1)(attended)
    normed = tf.contrib.layers.layer_norm(dropped)

    # Dense layer of the same width, added as a residual, then normalised again.
    projected = normed + tf.keras.layers.Dense(units=normed.shape[-1], activation=None)(normed)
    return tf.contrib.layers.layer_norm(projected)

# i=np.ones([32,1000,4], np.float32)
# M=transformer_module(i)
# print(M)

In [None]:
def encode(input, layers, debug=False):
    """Feed `input` through `layers` successive transformer_module calls.

    Each call receives the previous call's result; with ``debug`` set, the
    shape of the tensor entering each call is printed first. Returns the
    result of the last call (or `input` unchanged when `layers` is 0).
    """
    hidden = input
    for _ in range(layers):
        if debug:
            print("input",hidden.shape)
        hidden = transformer_module(hidden, debug=debug)
    return hidden

def decode(z, output, layers, debug=False):
    """Refine `output` with `layers` successive transformer_module calls.

    Every call receives the same `z` as its first argument and the previous
    call's result as its second.

    Args:
        z: tensor passed unchanged to every call.
        output: initial second argument; replaced by each call's result.
        layers: number of calls to chain.
        debug: forwarded to transformer_module. The body previously referenced
            an undefined name ``debug``, raising NameError on the first
            iteration; it is now an explicit, defaulted parameter.

    Returns:
        The result of the last call (or `output` unchanged when `layers` is 0).
    """
    for _ in range(layers):
        output = transformer_module(z, output, debug=debug)
    return output

def encoder_decoder(input, output, layers):
    """Encode `input`, then decode `output` against the encoding.

    Both stages use the same `layers` count; the decoder's result is returned.
    """
    return decode(encode(input, layers), output, layers)
    
def transformer_net(input, output=None, layers=1,
                    activate=tf.keras.layers.Activation(activation='softmax'),
                    internal_dim=10, debug=False):
    """Assemble the encoder (and optional decoder) stack on top of `input`.

    Steps: Dense projection of `input` to `internal_dim`, addition of the
    positional encoding, `layers` encoder modules, an optional decoder pass
    over `output`, a Dense projection back to ``input.shape[-1]`` features and
    an optional activation.

    Args:
        input: source tensor.
        output: optional target tensor; when given it is masked (mask value
            0.), projected to `internal_dim`, position-encoded and decoded
            against the encoder result.
        layers: number of modules in the encoder and in the decoder.
        activate: callable applied to the final projection, or None to return
            the raw projection. The default layer instance is created once,
            when the function is defined.
        internal_dim: width of the internal representation.
        debug: print intermediate shapes. The positional-encoding shape print
            used to run unconditionally and invoked the encoding layer a
            second time just for printing; it is now gated by this flag and
            reuses the single encoding tensor.

    Returns:
        The final (optionally activated) tensor.
    """
    inp = tf.keras.layers.Dense(units=internal_dim, activation=None)(input)

    # positional encoding
    lpe = tf.keras.layers.Lambda(lambda x: positional_encoding(x.shape), output_shape=inp.shape.as_list()[1:])
    encoding = lpe(inp)
    if debug:
        print(inp.shape, encoding.shape)
    inp = tf.keras.layers.add([inp, encoding])

    retoutput = encode(inp, layers, debug)

    if output is not None:
        output = tf.keras.layers.Masking(mask_value=0.)(output)
        output = tf.keras.layers.Dense(units=internal_dim, activation=None)(output)
        output = tf.keras.layers.add([output,lpe(output)])
        retoutput = decode(retoutput,output, layers)

    # Project back to the feature width of the original input.
    retoutput = tf.keras.layers.Dense(input.shape[-1])(retoutput)
    if activate is not None:
        retoutput = activate(retoutput)

    if debug:
        print(retoutput.shape)

    return retoutput
    
# Column vector 0..511 as float32. The dtype is set on arange: the third
# positional argument of np.reshape is `order`, not a dtype, so the previous
# call raised instead of converting.
# NOTE(review): this name shadows the builtin `input`.
input=np.arange(0,512,dtype=np.float32).reshape((512,1))

# output=np.zeros(input.shape,np.float32)
# output[:-1] = input[1:]




In [None]:
from tensorflow.keras.utils import get_file
import io

def prepare_dataset(maxlen=40, step=3):
    """Download the Nietzsche corpus and one-hot encode it for next-character prediction.

    The lower-cased text is cut into overlapping windows of `maxlen`
    characters taken every `step` characters; the character following each
    window is its label.

    Args:
        maxlen: window length in characters (default 40).
        step: distance between the starts of consecutive windows (default 3).

    Returns:
        Tuple ``(x, y)`` of float32 arrays: ``x`` has shape
        (windows, maxlen, vocabulary size) and ``y`` has shape
        (windows, vocabulary size); both are one-hot along the last axis.
    """
    path = get_file(
        'nietzsche.txt',
        origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
    with io.open(path, encoding='utf-8') as f:
        text = f.read().lower()
    print('corpus length:', len(text))

    # Vocabulary: every distinct character, sorted so indices are stable.
    chars = sorted(set(text))
    print('total chars:', len(chars))
    char_indices = {c: i for i, c in enumerate(chars)}

    # cut the text in semi-redundant sequences of maxlen characters
    starts = range(0, len(text) - maxlen, step)
    print('nb sequences:', len(starts))

    print('Vectorization...')
    # Encode the corpus once; each window is then a slice of this index array.
    encoded = np.array([char_indices[c] for c in text], dtype=np.int64)
    x = np.zeros((len(starts), maxlen, len(chars)), dtype=np.float32)
    y = np.zeros((len(starts), len(chars)), dtype=np.float32)
    positions = np.arange(maxlen)
    for row, start in enumerate(starts):
        x[row, positions, encoded[start:start + maxlen]] = 1
        y[row, encoded[start + maxlen]] = 1

    return x,y

# Build the one-hot training arrays and report their shapes.
X_train,y_train=prepare_dataset()
print(X_train.shape,y_train.shape)

In [None]:
# Show the first sample with its label, then the shapes of one sample and one label.
print(X_train[0], y_train[0])
print(X_train[0,:,:].shape,y_train[0,:].shape)


In [None]:


def loss(logits,labels):
    """Summed softmax cross-entropy between the last step's logits and `labels`.

    Only ``logits[:, -1]`` is scored. The shapes of the full logits and of
    that slice are printed on every call.
    """
    last_step = logits[:,-1]
    print(logits.shape,last_step.shape)

    per_example = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=last_step)
    return tf.reduce_sum(per_example)

#l = loss(X_train[0], y_train[0])
# Build the Keras model once, then train it with a manual eager loop.
batch_size=64
epochs=90
print(X_train.shape)
X=tf.keras.Input(X_train[0].shape, batch_size=batch_size)
print("input shape", X.shape)
logits = transformer_net(X, activate=None, debug=True)
model = tf.keras.Model(inputs=(X,),outputs=logits)

# One optimizer and one global step for the whole run (they used to be
# re-created on every batch).
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-4)
global_step = tf.train.get_or_create_global_step()
variables = model.trainable_variables

for epoch in range(epochs):
    print("epoch ", epoch)
    # The model was built with a fixed batch size, so a trailing partial
    # batch is skipped.
    for i in range(0, len(X_train) - batch_size + 1, batch_size):
        batch_x = tf.convert_to_tensor(X_train[i:i+batch_size,:,:])
        batch_y = y_train[i:i+batch_size,:]
        # The forward pass and the loss must both run inside the tape so the
        # gradient can be traced back to the model variables.
        with tf.GradientTape() as tape:
            l = loss(model(batch_x), batch_y)

        print("loss:", l.numpy())

        grads = tape.gradient(l, variables)
        optimizer.apply_gradients(zip(grads, variables),
                                    global_step=global_step)


        

In [None]:

# Print every entry of `variables` (set by the training cell).
for v in variables:
    print(v)


# NOTE(review): `container` is not defined in any cell visible here — confirm
# where it comes from; on a fresh kernel this line would raise NameError.
print(container.variables())


In [None]:
import itertools

def input_fn(images, labels, epochs, batch_size):
    """Build a shuffled, repeated, batched and prefetched dataset.

    The (images, labels) pair is sliced along its first axis, shuffled with a
    buffer of 5000 elements, repeated `epochs` times, grouped into batches of
    `batch_size` and prefetched two batches ahead.
    """
    shuffle_buffer = 5000
    prefetch_depth = 2
    return (
        tf.data.Dataset.from_tensor_slices((images, labels))
        .shuffle(shuffle_buffer)
        .repeat(epochs)
        .batch(batch_size)
        .prefetch(prefetch_depth)
    )


In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse





# def my_model(features, labels, mode, params):
#     """DNN with three hidden layers, and dropout of 0.1 probability."""
#     # Create three fully connected layers each layer having a dropout
#     # probability of 0.1.
#     #net = tf.feature_column.input_layer(features, params['input_1'])
    
#     # Compute predictions.
#     model=MyModel(features, params)
#     model.compile()
    
#     predicted_classes = tf.argmax(model.output,axis= 1)
    
#     #model = tf.keras.Model(inputs=(features['input_1'],),outputs=(logits, predicted_classes) )
    
#     if mode == tf.estimator.ModeKeys.PREDICT:
        
#         predictions = {
#             'class_ids': predicted_classes[:, tf.newaxis],
#             'probabilities': tf.nn.softmax(logits),
#             'logits': logits,
#         }
#         return tf.estimator.EstimatorSpec(mode, predictions=predictions)

#     print(logits.shape)
#     # Compute loss.
#     loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

#     # Compute evaluation metrics.
#     accuracy = tf.metrics.accuracy(labels=labels,
#                                    predictions=predicted_classes,
#                                    name='acc_op')
#     metrics = {'accuracy': accuracy}
#     tf.summary.scalar('accuracy', accuracy[1])

#     if mode == tf.estimator.ModeKeys.EVAL:
#         return tf.estimator.EstimatorSpec(
#             mode, loss=loss, eval_metric_ops=metrics)

#     # Create training op.
#     assert mode == tf.estimator.ModeKeys.TRAIN

#     optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
#     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
#     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)


def train_input_fn(X, y, batch_size):
    """Training dataset: flattened features under 'inputlayer', paired with `y`.

    Each sample of `X` is flattened to one row; the dataset is shuffled with a
    buffer of 100, repeated indefinitely and batched by `batch_size`.
    """
    flat = tf.reshape(X, [X.shape[0],-1])
    features = {'inputlayer': flat}
    dataset = tf.data.Dataset.from_tensor_slices((features, y))
    return dataset.shuffle(100).repeat().batch(batch_size)
    
def eval_input_fn(X, y, batch_size):
    """Evaluation/prediction dataset: flattened features, optionally with labels.

    Each sample of `X` is flattened to one row under the key 'inputlayer'.
    When `y` is None the dataset yields features only; otherwise it yields
    (features, label) pairs. The dataset is batched by `batch_size`, which
    must not be None (checked with an assertion after the dataset is built).
    """
    features = {'inputlayer': tf.reshape(X, [X.shape[0],-1])}
    # No labels, use only features.
    inputs = features if y is None else (features, y)

    ds = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None, "batch_size must not be None"
    return ds.batch(batch_size)

def MultiHeadAttention(Q,K,V):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Despite the name, no head splitting happens here; the function performs a
    single attention step over the given tensors.

    Fixes over the previous body: the scores use a matrix product instead of
    an elementwise ``*``, the scale is cast to a float dtype before the
    square root, the argument-less ``tf.concat`` call is gone, and the result
    is returned.

    Args:
        Q: queries, shape (..., query positions, d_k), floating dtype.
        K: keys, shape (..., key positions, d_k), same dtype as Q.
        V: values, shape (..., key positions, d_v), same dtype as Q.

    Returns:
        Tensor of shape (..., query positions, d_v).
    """
    Q = tf.convert_to_tensor(Q)
    # d_k is the size of the last axis of K.
    key_dim = tf.cast(tf.shape(K)[-1], Q.dtype)
    logits = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(key_dim)
    # Softmax over the last axis: weights over key positions sum to 1.
    weights = tf.nn.softmax(logits)
    return tf.matmul(weights, V)

    
def comp_model():
    """Wrap the two outputs of kerasModel() in a compiled tf.keras.Model.

    The model is compiled with an Adam optimizer, 'mse' loss and an
    'accuracy' metric, then returned.
    """
    # NOTE(review): `kerasModel` and `inputs` are not defined in any cell
    # visible here — confirm they exist before this function is called.
    logits, probs = kerasModel()
    model = tf.keras.Model(inputs, [ logits, probs])

    model.compile(optimizer=tf.train.AdamOptimizer(),
                  loss='mse',
                  metrics=['accuracy'])
    return model


# Convert the Keras model to an Estimator, train it on MNIST and evaluate it.
# NOTE(review): `kerasModel` is not defined in any cell visible here, and
# comp_model() unpacks its result as a (logits, probs) pair — confirm that a
# compiled model is what reaches model_to_estimator.
model = kerasModel()

estimator = tf.keras.estimator.model_to_estimator(model)

batch_size=32
train_steps=200#60000*4

# Fetch the data
(train_x, train_y),(test_x, test_y) = tf.keras.datasets.mnist.load_data()
# np.float was only an alias for the builtin float (float64) and has been
# removed from NumPy; name the dtype explicitly.
train_x=train_x.astype(np.float64)
train_y = train_y.astype(np.int32)
test_x = test_x.astype(np.float64)
test_y = test_y.astype(np.int32)




# Train the Model.
estimator.train(
    input_fn=lambda: train_input_fn(train_x, train_y, batch_size),
    steps=train_steps)

# Evaluate the model.
eval_result = estimator.evaluate(
    input_fn=lambda: eval_input_fn(test_x, test_y, batch_size))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

# Generate predictions from the model

In [None]:
# Flatten the first 32 training images and ask the estimator for predictions.
predict_x=train_x[0:32].reshape((32,-1))
print(predict_x.shape)
predictions = estimator.predict(
    input_fn=lambda:eval_input_fn(predict_x,None,
                                            batch_size=batch_size))
# NOTE(review): Estimator.predict presumably returns a lazy iterator of
# prediction dicts; if so, indexing it with [1] will fail — confirm.
for pred_dict in predictions[1]:
    print(pred_dict)


# NOTE(review): `expected` and `iris_data` are not defined in any cell visible
# here, and the 'class_ids' / 'probabilities' keys only appear in the
# commented-out my_model — confirm this loop still applies to this notebook.
for pred_dict, expec in zip(predictions, expected):
    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')

    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    print(template.format(iris_data.SPECIES[class_id],
                          100 * probability, expec))