# CS CE problem v2


* Autoencoding problems

* Crossover problems

* Encoder and decoder scoping by modality (either input or target)

* Dense layer to map code tensors into 128-dimensional vectors.


In [1]:

import numpy as np

import tensorflow as tf

from tensor2tensor.data_generators import problem
from tensor2tensor.layers import common_layers
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

from tensor2tensor.models import transformer

from tensor2tensor.models.transformer import transformer_base

from tensorflow.contrib.eager.python import tfe
tfe.enable_eager_execution()
Modes = tf.estimator.ModeKeys


from tk.models import similarity_transformer
from tk.data_generators import function_docstring

import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline


In [26]:

def scoped_flexible_encode(self, inputs, target_space, hparams, scope_name,
                           features_key, features=None, losses=None):
    """Encode `inputs` with a transformer encoder inside `scope_name`.

    Unlike the scoped encoder methods on the model class, the nonpadding mask
    is looked up under `features_key`, so the caller chooses which feature
    the mask is derived from.

    Args:
      self: model instance; only `self.attention_weights` is read here.
      inputs: tensor to encode; flattened with `common_layers.flatten4d3d`.
      target_space: forwarded to `transformer_prepare_encoder`.
      hparams: model hyperparameters; `layer_prepostprocess_dropout` is read.
      scope_name: name of the variable scope the encoder is built in.
      features_key: key in `features` used to derive the nonpadding mask.
      features: optional features dict, forwarded to the helpers.
      losses: optional value forwarded to `transformer_encoder`.

    Returns:
      A tuple `(encoder_output, encoder_decoder_attention_bias)`.
    """
    # Function-scope import: mlperf_log is not imported at the top of the file.
    from tensor2tensor.utils import mlperf_log

    with tf.variable_scope(scope_name):

        inputs = common_layers.flatten4d3d(inputs)

        # The helpers live in the `transformer` module; the bare names used
        # previously were never imported and raised NameError when called.
        encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
            transformer.transformer_prepare_encoder(
                inputs, target_space, hparams, features=features))

        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
            value=hparams.layer_prepostprocess_dropout)

        # tf.nn.dropout takes a keep probability, hence 1 - dropout rate.
        encoder_input = tf.nn.dropout(
            encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

        encoder_output = transformer.transformer_encoder(
            encoder_input,
            self_attention_bias,
            hparams,
            nonpadding=transformer.features_to_nonpadding(
                features, features_key),
            save_weights_to=self.attention_weights,
            make_image_summary=not common_layers.is_xla_compiled(),
            losses=losses)

        return encoder_output, encoder_decoder_attention_bias


@registry.register_model
class ConstrainedEmbeddingTransformerV2(transformer.Transformer):
  """Transformer with separately scoped string/code encoders and decoders.

  Each encoder and decoder is built inside its own variable scope
  ("string_encoder", "code_encoder", "string_decoder", "code_decoder").
  Only the string -> code mapping is wired into `body` at the moment.
  """

  def scoped_encoder(self, tensor, scope_name, features,
                     target_space=problem.SpaceID.EN_TOK):
    """Run the inherited `encode` on `tensor` inside scope `scope_name`.

    Args:
      tensor: tensor to encode.
      scope_name: variable scope the encoder is built in.
      features: features dict forwarded to `encode`.
      target_space: target space id forwarded to `encode`.

    Returns:
      Whatever `self.encode` returns; callers in this class unpack it as
      `(encoder_output, encoder_decoder_attention_bias)`.
    """
    with tf.variable_scope(scope_name):
      return self.encode(tensor, target_space, self._hparams,
                         features=features)

  def scoped_decoder(self, encoder_output, scope_name, features,
                     encoder_decoder_attention_bias):
    """Decode `features["targets"]` against `encoder_output` in `scope_name`.

    Args:
      encoder_output: encoder output the decoder attends to.
      scope_name: variable scope the decoder is built in.
      features: features dict; `features["targets"]` is the decoder input.
      encoder_decoder_attention_bias: bias for encoder-decoder attention.

    Returns:
      The decoder output reshaped to the shape `features["targets"]` had
      before it was flattened.
    """
    hparams = self._hparams
    targets = features["targets"]
    # Remember the incoming shape so the output can be restored to it.
    targets_shape = common_layers.shape_list(targets)
    targets = common_layers.flatten4d3d(targets)

    with tf.variable_scope(scope_name):
      decoder_input, decoder_self_attention_bias = (
          transformer.transformer_prepare_decoder(
              targets, hparams, features=features))

      decoder_output = self.decode(
          decoder_input,
          encoder_output,
          encoder_decoder_attention_bias,
          decoder_self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "targets"),
          losses=None)  # No auxiliary losses are collected from the decoder.

    return tf.reshape(decoder_output, targets_shape)

  def string_encoder(self, tensor, features):
    """Encode `tensor` with the encoder scoped as "string_encoder"."""
    return self.scoped_encoder(tensor, "string_encoder", features)

  def code_encoder(self, tensor, features):
    """Encode `tensor` with the encoder scoped as "code_encoder"."""
    return self.scoped_encoder(tensor, "code_encoder", features)

  def string_decoder(self, tensor, features, attn_bias):
    """Decode against `tensor` with the decoder scoped as "string_decoder"."""
    return self.scoped_decoder(tensor, "string_decoder", features, attn_bias)

  def code_decoder(self, tensor, features, attn_bias):
    """Decode against `tensor` with the decoder scoped as "code_decoder"."""
    return self.scoped_decoder(tensor, "code_decoder", features, attn_bias)

  def maybe_predict(self, features):
    """Return an embedding of `features["inputs"]` when in PREDICT mode.

    `hparams.predict_mode` selects the encoder: "code" uses the code encoder
    and the "dense_code" projection, "docstring" uses the string encoder and
    the "dense_string" projection.

    Returns:
      `(embedding, {"training": 0.0})` when the mode is PREDICT and
      `predict_mode` is one of the two values above; otherwise None.
    """
    if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
      return None

    predict_mode = self._hparams.predict_mode
    if predict_mode == "code":
      code_emb_raw, _ = self.code_encoder(features["inputs"], features)
      return self.scoped_dense(code_emb_raw, "dense_code"), {"training": 0.0}
    if predict_mode == "docstring":
      string_emb_raw, _ = self.string_encoder(features["inputs"], features)
      return (self.scoped_dense(string_emb_raw, "dense_string"),
              {"training": 0.0})
    return None

  def scoped_dense(self, emb, scope_name, units=128):
    """Project `emb` through a dense layer built inside scope `scope_name`.

    Args:
      emb: tensor to project.
      scope_name: variable scope the dense layer is built in.
      units: output dimensionality; defaults to 128.

    Returns:
      The projected tensor, with a last dimension of size `units`.
    """
    with tf.variable_scope(scope_name):
      # `tf.nn.dense` does not exist; `tf.layers.dense` is the TF 1.x layer.
      return tf.layers.dense(emb, units)

  def body(self, features):
    """Constrained-embedding cross-encoder body (string -> code only for now).

    - Expects features {"inputs": doc string, "targets": code}.
    - In PREDICT mode with a recognised `hparams.predict_mode`, returns the
      result of `maybe_predict` (a dense-projected code or docstring
      embedding; no reduction to a single vector is performed here).
    - Otherwise returns the output of the code decoder attending to the
      string encoder output. No losses dict is returned.

    Args:
      features: features dict with "inputs" and "targets".

    Returns:
      Either the `maybe_predict` tuple or the string -> code decoder output.
    """
    prediction = self.maybe_predict(features)
    if prediction is not None:
      return prediction

    string_emb_raw, string_attn_bias = self.string_encoder(
        features["inputs"], features)

    # Not consumed by any loss yet; still built so the "code_encoder"
    # variables keep being created on the training path, as before.
    self.code_encoder(features["targets"], features)

    code_from_string = self.code_decoder(
        string_emb_raw, features, string_attn_bias)

    # Return only the decoder output. Calling self.top / self.loss here
    # failed with the "padded_cross_entropy_size_check" reshape error,
    # presumably because the features passed to body are already embedded
    # (TODO confirm); top and loss are left to the base model instead.
    #
    # TODO: add the remaining mappings (string -> string, code -> code,
    # code -> string) and the embedding similarity loss
    # (compute_similarity_costs on the reduced string/code embeddings),
    # then return `None, losses` with losses["training"] set to their sum.
    return code_from_string


In [27]:

# Problem object that provides the dataset, problem hparams and encoders.
mp_constrained_embedding = function_docstring.GithubConstrainedEmbedding()

# Directory the dataset files are read from.
data_dir = "/mnt/nfs-east1-d/data"

# Start from the tiny similarity-transformer hparams and point them at the data.
hparams = similarity_transformer.similarity_transformer_tiny()
hparams.data_dir = data_dir

# Problem hparams derived from the model hparams.
p_hparams = mp_constrained_embedding.get_hparams(hparams)

# Instantiate the model in TRAIN mode.
model = ConstrainedEmbeddingTransformerV2(
    hparams, tf.estimator.ModeKeys.TRAIN, p_hparams
)

# Get the encoders from the problem
encoders = mp_constrained_embedding.feature_encoders(data_dir)

# Setup helper functions for encoding and decoding
def encode(input_str, output_str=None):
  """Turn a string into a features dict for the model.

  The string is encoded with the "inputs" encoder, id 1 (EOS) is appended,
  and the ids are reshaped to [1, length, 1]. `output_str` is accepted but
  not used.
  """
  token_ids = encoders["inputs"].encode(input_str)
  token_ids = token_ids + [1]  # add EOS id
  # Reshape to 3D: a batch of one, any length, one trailing dimension.
  return {"inputs": tf.reshape(token_ids, [1, -1, 1])}

def decode(integers):
  """List of ints to str

  For decoding an integer encoding to its string representation,
  not for decoding an embedding vector into the same.

  The ids are flattened to one dimension and cut at the first EOS id (1),
  if there is one, before being passed to the "inputs" encoder.
  """
  # reshape(-1) always gives a 1-D array. np.squeeze collapsed a single id
  # to a 0-d array, which list() cannot iterate.
  ids = np.asarray(integers).reshape(-1).tolist()
  if 1 in ids:
    ids = ids[:ids.index(1)]
  # Pass the plain list: squeezing again would turn a one-element list
  # back into a 0-d array.
  return encoders["inputs"].decode(ids)

# One example per batch.
batch_size = 1
# NOTE(review): the dataset is requested in PREDICT mode (the log output shows
# the "-dev" files being read) although it is named train_dataset and feeds
# the training loop - confirm this is intended.
train_dataset = mp_constrained_embedding.dataset(Modes.PREDICT, data_dir)
# Repeat with count None, then batch.
train_dataset = train_dataset.repeat(None).batch(batch_size)

# Eager-mode iterator over the batched dataset.
iterator = tfe.Iterator(train_dataset)


INFO:tensorflow:Reading data files from /mnt/nfs-east1-d/data/github_function_docstring-dev*


[2018-10-29 22:22:47,074] Reading data files from /mnt/nfs-east1-d/data/github_function_docstring-dev*


INFO:tensorflow:partition: 0 num_data_files: 1


[2018-10-29 22:22:47,081] partition: 0 num_data_files: 1


In [28]:
  
# Adam optimizer constructed with no arguments, i.e. library defaults.
optimizer = tf.train.AdamOptimizer()

@tfe.implicit_value_and_gradients
def loss_fn(features):
    """Run the model on `features` and return its "training" loss.

    The decorator makes the call return the loss value together with the
    gradients for the variables used while computing it.
    """
    model_output = model(features)
    _, loss_dict = model_output
    return loss_dict["training"]

# Number of optimisation steps to run, and how often to report the loss.
NUM_STEPS = 10
LOG_EVERY = 1

for count, example in enumerate(iterator):
    # Check the budget before doing any work so that exactly NUM_STEPS
    # steps run; checking after the step ran one step too many.
    if count >= NUM_STEPS:
        break

    loss, gv = loss_fn(example)
    optimizer.apply_gradients(gv)

    if count % LOG_EVERY == 0:
        print("Step: %d, Loss: %.3f" % (count, loss.numpy()))


InvalidArgumentError: Input to reshape is a tensor with 271722 values, but the requested shape has 34780416 [Op:Reshape] name: padded_cross_entropy_size_check