# Code search debug setup

Here's an example setup for debugging aspects of the code_search example. This should make it easier to debug and write tests for various aspects of the model as well as provide a simple interface for exploring its performance during development.

This setup is fairly similar to [hello_t2t.ipynb](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/hello_t2t.ipynb).

## Imports

In [1]:

import csv
from six import StringIO
import tempfile

from tensor2tensor.data_generators import problem
from tensor2tensor.layers import common_layers
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import metrics

import tensorflow as tf

import tk

from tensorflow.contrib.eager.python import tfe
tfe.enable_eager_execution()
Modes = tf.estimator.ModeKeys


Instructions for updating:
Use the retry module or similar alternatives.


## Datagen

In [6]:

@registry.register_problem
class GithubFunctionDocstring(text_problems.Text2TextProblem):
  """Function and docstring similarity problem.

  The data consists of function and docstring pairs stored as CSV files.
  Each file has two columns and no header row: the docstring tokens
  followed by the function tokens. The delimiter is ",".
  """

  DATA_PATH_PREFIX = "gs://kubeflow-examples/t2t-code-search/raw_data"

  @property
  def pair_files_list(self):
    """Return URL and file names.

    This format is a convention across the Tensor2Tensor (T2T)
    codebase. It should be noted that the file names are currently
    hardcoded. This is to preserve the semantics of a T2T problem.
    In case a change of these values is desired, one must subclass
    and override this property.

    Only shard 0 is listed at the moment (`range(1)`).

    # TODO(sanyamkapoor): Manually separate train/eval data set.

    Returns:
      A list of the format,
        [
          [
            "STRING",
            ("STRING", "STRING", ...)
          ],
          ...
        ]
      Each element is a list of size 2 where the first represents
      the source URL and the next is an n-tuple of file names.
      In this case, the tuple is of size 1 because the URL points
      to a file itself.
    """
    return [
        [
            "{}/func-doc-pairs-000{:02}-of-00100.csv".format(
                self.DATA_PATH_PREFIX, i),
            ("func-doc-pairs-000{:02}-of-00100.csv".format(i),)
        ]
        for i in range(1)
    ]

  @property
  def is_generate_per_split(self):
    """Always False for this problem."""
    return False

  @property
  def approx_vocab_size(self):
    """Approximate vocabulary size: 2**13 (8192)."""
    return 2**13

  @property
  def max_samples_for_vocab(self):
    """Upper bound (200,000) on samples used for the vocabulary."""
    # FIXME(sanyamkapoor): This exists to handle memory explosion.
    return int(2e5)

  def get_csv_files(self, _data_dir, tmp_dir, _dataset_split):
    """Download (if needed) every file in `pair_files_list`.

    Args:
      _data_dir: Unused.
      tmp_dir: A string, the directory handed to
        `generator_utils.maybe_download` as the download location.
      _dataset_split: Unused.

    Returns:
      A list with one `maybe_download` result per entry of
      `pair_files_list`, in the same order.
    """
    return [
        generator_utils.maybe_download(tmp_dir, file_list[0], uri)
        for uri, file_list in self.pair_files_list
    ]

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Yield one sample per (docstring, function) row of the CSV files.

    Args:
      data_dir: A string representing the data directory.
      tmp_dir: A string representing the temporary directory and is
              used to download files if not already available.
      dataset_split: Train, Test or Eval.

    Yields:
      Each element yielded is of a Python dict of the form
        {"inputs": "STRING", "targets": "STRING", "embed_code": [0]}

    Raises:
      ValueError: if a non-empty row does not have exactly two columns.
    """
    csv_files = self.get_csv_files(data_dir, tmp_dir, dataset_split)

    for pairs_file in csv_files:
      tf.logging.debug("Reading {}".format(pairs_file))
      with tf.gfile.Open(pairs_file) as csv_file:
        for line in csv_file:
          for row in csv.reader(StringIO(line)):
            if not row:
              # A blank line parses to an empty row; there is no pair to emit.
              continue
            docstring_tokens, function_tokens = row
            yield {
                "inputs": docstring_tokens,
                "targets": function_tokens,
                "embed_code": [0]
            }

  def example_reading_spec(self):
    """Describe how serialized examples are parsed.

    Extends the parent's data fields with a fixed-length int64
    "embed_code" feature and replaces the parent's decoders with plain
    tensor decoders for "inputs", "targets" and "embed_code".

    Returns:
      A `(data_fields, data_items_to_decoders)` tuple.
    """
    # Only the parent's data fields are reused; its decoders are replaced.
    data_fields, _ = super(GithubFunctionDocstring,
                           self).example_reading_spec()
    data_fields["embed_code"] = tf.FixedLenFeature([1], dtype=tf.int64)

    tensor_decoder = tf.contrib.slim.tfexample_decoder.Tensor
    data_items_to_decoders = {
        key: tensor_decoder(tensor_key=key)
        for key in ("inputs", "targets", "embed_code")
    }
    return data_fields, data_items_to_decoders

  def eval_metrics(self):  # pylint: disable=no-self-use
    """Return the evaluation metrics for this problem (accuracy only)."""
    return [
        metrics.Metrics.ACC
    ]


In [17]:

# Your paths here!

tmp_dir = "/mnt/nfs-east1-d/cs/tmp"
data_dir = "/mnt/nfs-east1-d/cs/data"

# Create the temporary directory first, then the data directory.
for _needed_dir in (tmp_dir, data_dir):
  tf.gfile.MakeDirs(_needed_dir)


In [None]:

# Instantiate the problem class defined above (no constructor arguments).
problem_object = GithubFunctionDocstring()

# NOTE(review): presumably downloads the raw CSVs into tmp_dir and writes the
# encoded examples into data_dir -- confirm against T2T's generate_data.
problem_object.generate_data(data_dir, tmp_dir)


## Examine examples

In [55]:
import pprint
import numpy as np

In [32]:

# Pull a single example out of the training split and pretty-print it.
train_split = problem_object.dataset(Modes.TRAIN, data_dir)
example = tfe.Iterator(train_split).next()

pprint.pprint(example)


INFO:tensorflow:Reading data files from /mnt/nfs-east1-d/cs/data/github_function_docstring-train*


[2018-09-28 20:04:06,708] Reading data files from /mnt/nfs-east1-d/cs/data/github_function_docstring-train*


INFO:tensorflow:partition: 0 num_data_files: 100


[2018-09-28 20:04:06,802] partition: 0 num_data_files: 100


{'embed_code': <tf.Tensor: id=7345, shape=(1,), dtype=int64, numpy=array([0])>,
 'inputs': <tf.Tensor: id=7346, shape=(14,), dtype=int64, numpy=
array([2292, 2525,   61, 2707,   45, 1359,   24, 7081,  171, 1760,   49,
       1211,   18,    1])>,
 'targets': <tf.Tensor: id=7347, shape=(271,), dtype=int64, numpy=
array([   7,   37,    2, 2292, 2525,   61,    2, 7081,  102,  813,   10,
       1651, 4209, 4921,   12, 2965, 1005,   28,  641, 2410,   44,   67,
       5145,  792,    5,  127, 4954, 4590,  627,  145, 1853, 1836,   22,
        296,  605, 2410,   98,  477, 2847, 1121,  160,  228, 3088, 5555,
        169,  444, 1905,  109, 7781,    2,  102,  145, 7081, 1373,   45,
        228,  223, 1382,  477, 6201, 2613,  553,  822, 3029, 3029,   67,
       7081, 1373,   45,  444,  223, 1382,  477, 6201, 2613,  553,  822,
       3029,  822,  228, 7081, 1373,   45,  127,  223, 1382,  477, 6201,
       2613,  553,  822, 2099,    4, 3028,    4,  228, 7081, 1373,   45,
        296,  223, 1382,  477,

In [53]:

vocab_name = "vocab.github_function_docstring.8192.subwords"

# Get the encoders from the problem
encoders = problem_object.feature_encoders(data_dir)

# Setup helper functions for encoding and decoding
def encode(input_str, output_str=None):
  """Turn a raw string into a features dict, ready for inference.

  Args:
    input_str: The string to encode with the "inputs" encoder.
    output_str: Accepted but not used.

  Returns:
    A dict with a single "inputs" entry: the encoded ids followed by id 1
    (EOS), reshaped to [1, length, 1].
  """
  eos_id = 1
  token_ids = encoders["inputs"].encode(input_str)
  features = {
      # Make it 3D.
      "inputs": tf.reshape(token_ids + [eos_id], [1, -1, 1]),
  }
  return features

def decode(integers):
  """Convert a sequence of ids back into a string.

  Size-1 dimensions are squeezed away and everything from the first id 1
  (EOS) onwards is dropped before decoding with the "inputs" encoder.

  Args:
    integers: Ids as a list, array or tensor; after squeezing it should be
      at most one-dimensional.

  Returns:
    The decoded string.
  """
  # atleast_1d keeps a lone id iterable: squeeze alone collapses a
  # one-element sequence to a 0-d array, which cannot be iterated.
  ids = np.atleast_1d(np.squeeze(integers)).tolist()
  if 1 in ids:
    ids = ids[:ids.index(1)]
  return encoders["inputs"].decode(ids)


In [56]:

print(decode(example["inputs"]))


try multiple times to run ' throw_random '


In [60]:

print(decode(example["targets"]))


def throw_random lengths mask saved None for i in range maxtries try return throw_random_bits lengths mask except MaxtriesException as e saved e continue raise e


## Train setup

In [None]:

# Copying the model into your notebook makes it easier to debug and means you don't
# need to repeatedly re-load the entire t2t registry with each change.


def _get_initializer(_):
  """Placeholder initializer lookup: ignores its argument and returns None."""
  return None


@registry.register_model
class SimilarityTransformerDev(t2t_model.T2TModel):
  """Transformer model for similarity between two strings.

  Two transformer encoders each embed one string ('inputs' under the
  'string_embedding' scope, 'targets' under the 'code_embedding' scope).
  Cosine distance between the L2-normalized embeddings is the distance
  measure, and the training loss is a sigmoid cross-entropy computed over
  the all-vs-all distance matrix of the batch.
  """

  def top(self, body_output, _):
    """Return the body output unchanged; the second argument is ignored."""
    return body_output

  def body(self, features):
    """Embed 'inputs' and, when 'targets' is present, compute the loss.

    Args:
      features: Dict of feature tensors. 'inputs' is required; 'targets'
        is optional.

    Returns:
      The string embedding alone when 'targets' is absent, otherwise a
      tuple `(string_embedding, {'training': loss})`.
    """
    hparams = self.hparams
    # Fall back to defaults so hparams sets that lack these keys still work.
    initializer = _get_initializer(getattr(hparams, 'initializer', None))
    docs_encoder_trainable = getattr(hparams, 'docs_encoder_trainable', True)
    code_encoder_trainable = getattr(hparams, 'code_encoder_trainable', True)

    with tf.variable_scope('string_embedding', initializer=initializer):
      string_embedding = self.encode(features, 'inputs',
                                     trainable=docs_encoder_trainable)

    # Is this to detect whether we're in training mode?
    # Instead could use Modes key.
    if 'targets' not in features:
      return string_embedding

    with tf.variable_scope('code_embedding', initializer=initializer):
      code_embedding = self.encode(features, 'targets',
                                   trainable=code_encoder_trainable)

    loss = self.loss(string_embedding, code_embedding)

    return string_embedding, {'training': loss}

  def distance(self, string_embedding, code_embedding):
    """Return the all-vs-all cosine distance matrix.

    Args:
      string_embedding: 2-D tensor, one embedding per row.
      code_embedding: 2-D tensor, one embedding per row.

    Returns:
      A matrix whose entry (i, j) is one minus the dot product of the
      L2-normalized row i of `string_embedding` and row j of
      `code_embedding`.
    """
    string_embedding_norm = tf.nn.l2_normalize(string_embedding, axis=1)
    code_embedding_norm = tf.nn.l2_normalize(code_embedding, axis=1)

    # All-vs-All cosine distance matrix, reshaped as row-major.
    cosine_dist = 1.0 - tf.matmul(string_embedding_norm, code_embedding_norm,
                                  transpose_b=True)
    return cosine_dist

  def loss(self, string_embedding, code_embedding):
    """Sigmoid cross-entropy over every (string, code) combination.

    Args:
      string_embedding: 2-D tensor, one embedding per row.
      code_embedding: 2-D tensor, one embedding per row.

    Returns:
      The un-reduced output of `tf.nn.sigmoid_cross_entropy_with_logits`:
      one row per (string, code) combination and two columns.
    """
    cosine_dist = self.distance(string_embedding, code_embedding)
    cosine_dist_flat = tf.reshape(cosine_dist, [-1, 1])

    # Positive samples on the diagonal, reshaped as row-major.
    label_matrix = tf.eye(tf.shape(cosine_dist)[0], dtype=tf.int32)
    label_matrix_flat = tf.reshape(label_matrix, [-1])

    # Column 0 is the similarity (1 - distance), column 1 is the distance.
    logits = tf.concat([1.0 - cosine_dist_flat, cosine_dist_flat], axis=1)
    # NOTE(review): one_hot maps label 1 (a diagonal, i.e. positive, pair) to
    # [0, 1], which puts the target 1 on the *distance* column. That looks
    # inverted relative to pulling positive pairs together -- confirm.
    labels = tf.one_hot(label_matrix_flat, 2)

    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    return loss

  # Change to "embed"?
  def encode(self, features, input_key, trainable=True):
    """Embed one feature with a transformer encoder and mean-pool it.

    Args:
      features: Dict of feature tensors.
      input_key: Key in `features` of the tensor to embed; it is passed
        through `common_layers.flatten4d3d` first.
      trainable: When false, the returned embedding is wrapped in
        `tf.stop_gradient` so no gradient flows back into this encoder.

    Returns:
      The encoder output averaged over axis 1 (presumably the sequence
      axis -- TODO confirm).
    """
    hparams = self._hparams
    inputs = common_layers.flatten4d3d(features[input_key])

    (encoder_input, encoder_self_attention_bias, _) = (
        transformer.transformer_prepare_encoder(inputs, problem.SpaceID.EN_TOK,
                                                hparams))

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer.transformer_encoder(
        encoder_input,
        encoder_self_attention_bias,
        hparams,
        nonpadding=transformer.features_to_nonpadding(features, input_key))

    encoder_output = tf.reduce_mean(encoder_output, axis=1)

    if not trainable:
      # Freeze this encoder by cutting the gradient at its output.
      encoder_output = tf.stop_gradient(encoder_output)

    return encoder_output


In [20]:

from tensor2tensor.models.transformer import transformer_base

def similarity_transformer_tiny():
  """Build a small hparams set for quick local runs of the similarity model.

  Starts from `transformer_base()`, shrinks the layer count, hidden size,
  filter size and head count, and sets the encoder-trainable flags and the
  initializer that `SimilarityTransformerDev.body` reads.

  Returns:
    The modified hparams object.
  """
  hparams = transformer_base()
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 128
  hparams.filter_size = 512
  hparams.num_heads = 4
  hparams.docs_encoder_trainable = True
  hparams.code_encoder_trainable = True
  hparams.initializer = None
  return hparams


In [21]:

# Tiny hparams pointed at the local data directory.
hparams = similarity_transformer_tiny()
hparams.data_dir = data_dir

p_hparams = problem_object.get_hparams(hparams)

# The model class defined in this notebook is SimilarityTransformerDev.
model = SimilarityTransformerDev(
    hparams, tf.estimator.ModeKeys.TRAIN, p_hparams
)

# Endlessly repeating training data, one example per batch.
batch_size = 1
train_dataset = problem_object.dataset(Modes.TRAIN, data_dir)
train_dataset = train_dataset.repeat(None).batch(batch_size)

optimizer = tf.train.AdamOptimizer()


INFO:tensorflow:Setting T2TModel mode to 'train'


[2018-09-28 20:00:55,412] Setting T2TModel mode to 'train'


INFO:tensorflow:Reading data files from /mnt/nfs-east1-d/cs/data/github_function_docstring-train*


[2018-09-28 20:00:55,417] Reading data files from /mnt/nfs-east1-d/cs/data/github_function_docstring-train*


INFO:tensorflow:partition: 0 num_data_files: 100


[2018-09-28 20:00:55,560] partition: 0 num_data_files: 100


## E2E test that combines train and eval steps

In [None]:

@tfe.implicit_value_and_gradients
def loss_fn(features):
  """Run the model on one batch and return its "training" loss."""
  _, losses = model(features)
  return losses["training"]

NUM_STEPS = 10
LOG_EVERY = 1

for count, example in enumerate(tfe.Iterator(train_dataset)):
  # Checked before the update so that exactly NUM_STEPS updates are applied.
  if count >= NUM_STEPS:
    break

  loss, gv = loss_fn(example)
  optimizer.apply_gradients(gv)

  if count % LOG_EVERY == 0:
    # SimilarityTransformerDev.loss returns an un-reduced tensor, so the
    # value may not be a scalar; reduce_mean is a no-op if it already is.
    print("Step: %d, Loss: %.3f" % (count, tf.reduce_mean(loss).numpy()))

model.set_mode(Modes.EVAL)
# Batched like the training data so the model sees the same rank of input.
dataset = problem_object.dataset(Modes.EVAL, data_dir).batch(batch_size)

example = next(tfe.Iterator(dataset))

# NOTE(review): goes through the full model call, as loss_fn does, instead
# of calling model.encode directly (which needs an input_key and returns a
# single tensor) -- confirm this is the intended evaluation path.
encoded, _ = model(example)


In [None]:

# TODO: Doesn't like data type, maybe add float cast in body.


## Debug distance method

In [None]:

#TODO: Not implemented, break out model.distance to enable easier inspection of calculated distances.


In [None]:

# Ad-hoc check: embed a natural-language query and a code snippet, then print
# the model's distance between the two embeddings.
query = "hello world"
code = "def my_function(query):  print(query)"

# NOTE(review): the SimilarityTransformerDev class in this notebook defines
# `encode(features, input_key)` and no `embed` method, so unless the T2TModel
# base class provides `embed`, this call needs updating -- confirm.
d = model.distance(model.embed(encode(query)), model.embed(encode(code)))

print("Distance: %s" % d)


## Initiate a long training run

Assuming relevant model and problem versions are in t2t_usr_dir

In [None]:

# Train locally, e.g. using tiny hparams to check that things run okay.

!t2t-trainer --t2t_usr_dir=/mnt/nfs-east1-d/work/tk/tk \
    ...


In [None]:

# TODO: Your favorite way to launch jobs here, e.g. Fairing, ksonnet, a Python
# wrapper around ksonnet, etc.

# In my case I'm currently using this (unsupported) library

# Experiment name plus the keyword options handed to tk.configure_experiment.
experiment_name = "cs-dev0"
experiment_options = dict(
    problem="github_function_docstring",
    num_gpu_per_worker=1,
    hparams_set="transformer_tiny",
    model="similarity_transformer",
    extra_hparams={},
    num_steps=10000,
)
args = tk.configure_experiment(experiment_name, **experiment_options)

# Build the experiment from the configured arguments and launch it.
job = tk.experiment.T2TExperiment(**args)
job.run()


## Examine trained model

In [None]:

# The path to checkpoints for the newly trained model, accessible to local FS

ckpt_path = "...ckpt-NNN"


#### Compute the distance for pair and non-pair

In [None]:

# TODO: Artificially construct pair and non-pair

# Illustrative placeholders only: each example needs an "inputs" (query)
# string and a "targets" (code) string. Replace with real data.
pair = {
    "inputs": "print a greeting",
    "targets": "def greet(name): print('hello ' + name)",
}
non_pair = {
    "inputs": "print a greeting",
    "targets": "def add(a, b): return a + b",
}

# encode() returns a features dict keyed by "inputs" for any string, so both
# the query and the code features are read back with that key below.
for ex in (pair, non_pair):
  ex["inputs"] = encode(ex["inputs"])
  ex["targets"] = encode(ex["targets"])

with tfe.restore_variables_on_create(ckpt_path):
    # NOTE(review): model.encode is called here outside the variable scopes
    # that body() uses and on raw token ids -- confirm this matches how the
    # checkpointed variables were created.
    pair_input_emb = model.encode(pair["inputs"], "inputs")
    pair_target_emb = model.encode(pair["targets"], "inputs")
    print("Dist for true pair: %s" % model.distance(pair_input_emb, pair_target_emb))

    # The false pair must be embedded from `non_pair`, not from `pair`.
    non_pair_input_emb = model.encode(non_pair["inputs"], "inputs")
    non_pair_target_emb = model.encode(non_pair["targets"], "inputs")
    print("Dist for false pair: %s" % model.distance(non_pair_input_emb, non_pair_target_emb))


#### Re-build the search index with the new model

In [None]:

# TODO: For the sake of model development, it would be nice if this could be done with a single command,
# provided the path to new checkpoints, and be triggered from the notebook.

def rebuild_index(ckpt_path):
  """Trigger a re-build of the search index.

  i.e. triggers external infrastructure to re-compute embeddings

  This is still a placeholder: nothing is wired up yet, so calling it
  fails loudly instead of returning a made-up value.

  Args:
    ckpt_path: Path to the checkpoints of the model whose embeddings
      should populate the index.

  Returns:
      index or index ID? (not decided yet)

  Raises:
    NotImplementedError: always, until an index backend is hooked up.
  """
  raise NotImplementedError(
      "rebuild_index is a placeholder; no index backend is wired up yet "
      "(ckpt_path=%r)" % (ckpt_path,))

index_id = rebuild_index(ckpt_path)


#### Run queries against the index

In [None]:

def search(query, ckpt_path, index_id):
  """Embed `query` with the restored model and look it up in the index.

  The index lookup itself is still a TODO, so this currently computes the
  query embedding and then raises.

  Args:
    query: The natural-language query string.
    ckpt_path: Path to the model checkpoints to restore variables from.
    index_id: Identifier of the index to search.

  Returns:
    The search hits (once the lookup is implemented).

  Raises:
    NotImplementedError: always, until the index lookup is implemented.
  """
  encoded_inputs = encode(query)
  with tfe.restore_variables_on_create(ckpt_path):
    # model.encode returns the embedding tensor itself and needs the key of
    # the feature to embed; encode() stores the query under "inputs".
    query_embedding = model.encode(encoded_inputs, "inputs")

  # TODO: Search `query_embedding` against index with `index_id`
  raise NotImplementedError(
      "searching index %r is not implemented yet" % (index_id,))


query = "hello world"

hits = search(query, ckpt_path, index_id)

print("Query: %s" % query)
print("Hits:")
pprint.pprint(hits)


#### Compute interpretable quality measures

In [None]:

# TODO: Trigger calculation of an interpretable measure of quality, along
# the lines of
# https://github.com/kubeflow/examples/issues/254#issuecomment-425606539

def compute_quality(ckpt_path, index_id):
  """Placeholder for an interpretable quality measure (see the TODO above).

  Args:
    ckpt_path: Path to the model checkpoints to evaluate.
    index_id: Identifier of the search index to evaluate against.

  Returns:
    None; nothing is computed yet.
  """
  pass

# Both arguments are required by the signature.
compute_quality(ckpt_path, index_id)
