# All: fine-tune T5 on bug2fix (small + medium)

## Set up

In [1]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

Installing dependencies...
Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
[K     |████████████████████████████████| 153 kB 4.8 MB/s 
[K     |████████████████████████████████| 4.5 MB 48.1 MB/s 
[K     |████████████████████████████████| 4.7 MB 34.0 MB/s 
[K     |████████████████████████████████| 4.6 MB 45.1 MB/s 
[K     |████████████████████████████████| 306 kB 52.7 MB/s 
[K     |████████████████████████████████| 385 kB 54.6 MB/s 
[K     |████████████████████████████████| 1.3 MB 47.6 MB/s 
[K     |████████████████████████████████| 116 kB 57.5 MB/s 
[K     |████████████████████████████████| 596 kB 47.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.4 MB/s 
[K     |████████████████████████████████| 101 kB 9.6 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.8 kB/s 
[K     |████████████████████████████████| 5.8 MB 35.5 MB/s 
[K     |████████████████████████████████| 438 kB 41.6 MB/s 
[K     |████████████████████████████████| 1.6 MB

In [2]:
# Install a pinned tensorflow-gcs-config (imported by the GCS setup cell below).
# NOTE(review): 2.9.1 is presumably chosen to match the runtime's TensorFlow
# version — confirm before changing.
!pip install -U tensorflow-gcs-config==2.9.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gcs-config==2.9.1
  Downloading tensorflow_gcs_config-2.9.1-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.4 MB/s 
[?25hInstalling collected packages: tensorflow-gcs-config
  Attempting uninstall: tensorflow-gcs-config
    Found existing installation: tensorflow-gcs-config 2.8.0
    Uninstalling tensorflow-gcs-config-2.8.0:
      Successfully uninstalled tensorflow-gcs-config-2.8.0
Successfully installed tensorflow-gcs-config-2.9.1


In [3]:
# Toggle for the Colab + TPU + GCS setup below.
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  # Passed to the model as `tpu_topology` in the "Define Model" cell.
  TPU_TOPOLOGY = "v3-8"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary exception type (not bare BaseException) and keep the
    # resolver's original error attached as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_service_account()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# `tf` is tensorflow.compat.v1; turn off TF2 behaviour for the rest of the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

# On cloud: show INFO-level records from the root logger, and stop the
# TensorFlow logger from propagating its records up to it.
if ON_CLOUD:
  py_logging.root.setLevel('INFO')
  tf.get_logger().propagate = False

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TF logging verbosity to `level` inside a `with` block.

  The verbosity that was active on entry is restored on exit, including when
  the body of the `with` block raises.

  Args:
    level: Verbosity value passed straight to `tf.logging.set_verbosity`.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without try/finally an exception in the body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)


Setting up GCS access...
Running on TPU: grpc://10.43.157.194:8470


Instructions for updating:
non-resource variables are not supported in the long term


Successfully saved credentials for daim-938@local-shoreline-357513.iam.gserviceaccount.com


In [4]:
# print(mesh_tensorflow.__version__)

In [5]:
# Record which t5 release is installed in this runtime.
print(t5.__version__)

0.9.3


In [6]:
# import gin
# import subprocess
# gin.parse_config_file(
#         'gs://t5-data/pretrained_models/base/operative_config.gin'
#     )


## Register Tasks Medium


In [7]:
def dumping_dataset(split, shuffle_files = False):
    """Build the raw bug2fix "medium" dataset for the requested split.

    Args:
      split: Split name. Only 'train' is backed by files (train.tsv and
        valid.tsv are read together); any other value gets an empty file list.
      shuffle_files: Accepted for interface compatibility and ignored.

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per
      tab-separated line of the source files.
    """
    del shuffle_files

    filenames = []
    if split == 'train':
        filenames = [
            'gs://cotext/data/bug2fix/medium/train.tsv',
            'gs://cotext/data/bug2fix/medium/valid.tsv',
        ]
    lines = tf.data.TextLineDataset(filenames)

    # Split each tab-separated line into a two-field tuple; quotes are
    # treated as ordinary characters.
    parse_tsv = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    fields = lines.map(
        parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Name the two fields.
    return fields.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into text-to-text features.

  Each example's "input" gets the task prefix "medium: " prepended and is
  stored under "inputs"; "target" is copied unchanged to "targets".

  Args:
    ds: Dataset of dicts with "input" and "target" entries.

  Returns:
    The result of mapping `ds` to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "medium: "

  def to_inputs_and_targets(ex):
    """Map {"input": ..., "target": ...} -> {"inputs": prefix + input, "targets": target}."""
    prefixed = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed, "targets": ex["target"]}

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the first few parsed rows. The rows come from the "train" split,
# so the message says so (it previously claimed "validation").
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'public static TYPE_1 init ( java.lang.String name , java.util.Date date )  OPEN_CURLY_TOKEN  TYPE_1 VAR_1 = new TYPE_1 ( ) ; VAR_1 . METHOD_1 ( name ) ; java.util.Calendar VAR_2 = java.util.Calendar.getInstance ( ) ; VAR_2 . METHOD_2 ( date ) ; VAR_1 . METHOD_3 ( VAR_2 ) ; return VAR_1 ;  CLOSE_CURLY_TOKEN ', 'target': b'public static TYPE_1 init ( java.lang.String name , java.util.Date date )  OPEN_CURLY_TOKEN  TYPE_1 VAR_1 = new TYPE_1 ( ) ; VAR_1 . METHOD_1 ( name ) ; java.util.Calendar VAR_2 = null ; if ( date != null )  OPEN_CURLY_TOKEN  VAR_2 = java.util.Calendar.getInstance ( ) ; VAR_2 . METHOD_2 ( date ) ;  CLOSE_CURLY_TOKEN  VAR_1 . METHOD_3 ( VAR_2 ) ; return VAR_1 ;  CLOSE_CURLY_TOKEN '}
{'input': b'public TYPE_1 METHOD_1 ( java.lang.String name )  OPEN_CURLY_TOKEN  if ( name . equals ( STRING_1 ) ) return new TYPE_2 ( STRING_2 , true ) ; if ( name . equals ( STRING_3 ) ) return new TYPE_3 ( STRING_4 , true ) ; if ( name . equals 

In [8]:
# Remove any earlier "medium" registration first — presumably so this cell can
# be re-run without a duplicate-name error.
t5.data.TaskRegistry.remove('medium')
t5.data.TaskRegistry.add(
    "medium",
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=dumping_dataset,
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    text_preprocessor=[ner_preprocessor],
    # No metric_fns or output_features are passed here; the earlier attempts
    # below are disabled, so the library defaults apply to both.

    # metric_fns=[t5.evaluation.metrics.accuracy, 
    #            t5.evaluation.metrics.sequence_accuracy, 
    #             ],
    # output_features=t5.data.Feature(vocabulary=t5.data.SentencePieceVocabulary(vocab))
)

<t5.data.dataset_providers.FunctionTask at 0x7ff660b9ed90>

## Register Tasks Small


In [9]:
# NOTE(review): this rebinds `dumping_dataset`, which an earlier cell already
# defined for the "medium" files. The "medium" task was registered with the
# earlier function object before this cell runs, so cell order matters here.
def dumping_dataset(split, shuffle_files = False):
    """Build the raw bug2fix "small" dataset for the requested split.

    Args:
      split: Split name. Only 'train' is backed by files (train.tsv and
        valid.tsv are read together); any other value gets an empty file list.
      shuffle_files: Accepted for interface compatibility and ignored.

    Returns:
      A dataset of {"input": ..., "target": ...} dicts, one per
      tab-separated line of the source files.
    """
    del shuffle_files

    filenames = []
    if split == 'train':
        filenames = [
            'gs://cotext/data/bug2fix/small/train.tsv',
            'gs://cotext/data/bug2fix/small/valid.tsv',
        ]
    lines = tf.data.TextLineDataset(filenames)

    # Split each tab-separated line into an (input, target) tuple; quotes are
    # treated as ordinary characters.
    parse_tsv = functools.partial(
        tf.io.decode_csv,
        record_defaults=["", ""],
        field_delim="\t",
        use_quote_delim=False)
    fields = lines.map(
        parse_tsv, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Map each tuple to a {"input": ... "target": ...} dict.
    return fields.map(lambda *ex: dict(zip(["input", "target"], ex)))

def ner_preprocessor(ds):
  """Turn raw {"input", "target"} examples into text-to-text features.

  Each example's "input" gets the task prefix "small: " prepended and is
  stored under "inputs"; "target" is copied unchanged to "targets".

  Args:
    ds: Dataset of dicts with "input" and "target" entries.

  Returns:
    The result of mapping `ds` to dicts with "inputs" and "targets" entries.
  """
  task_prefix = "small: "

  def to_inputs_and_targets(ex):
    """Map {"input": ..., "target": ...} -> {"inputs": prefix + input, "targets": target}."""
    prefixed = tf.strings.join([task_prefix, ex["input"]])
    return {"inputs": prefixed, "targets": ex["target"]}

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Peek at the first few parsed rows. The rows come from the "train" split,
# so the message says so (it previously claimed "validation").
print("A few raw training examples...")
for ex in tfds.as_numpy(dumping_dataset("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'public java.lang.String METHOD_1 ( )  OPEN_CURLY_TOKEN  return new TYPE_1 ( STRING_1 ) . format ( VAR_1  OPEN_SQUARE_TOKEN  ( ( VAR_1 . length ) - 1 )  CLOSE_SQUARE_TOKEN  . getTime ( ) ) ;  CLOSE_CURLY_TOKEN ', 'target': b'public java.lang.String METHOD_1 ( )  OPEN_CURLY_TOKEN  return new TYPE_1 ( STRING_1 ) . format ( VAR_1  OPEN_SQUARE_TOKEN  ( ( type ) - 1 )  CLOSE_SQUARE_TOKEN  . getTime ( ) ) ;  CLOSE_CURLY_TOKEN '}
{'input': b'public boolean METHOD_1 ( java.lang.String name )  OPEN_CURLY_TOKEN  TYPE_1 VAR_1 = TYPE_1 . METHOD_2 ( VAR_2 ) ; return ( ! ( METHOD_3 ( name ) ) ) && ( VAR_1 . contains ( name ) ) ;  CLOSE_CURLY_TOKEN ', 'target': b'public boolean METHOD_1 ( java.lang.String name )  OPEN_CURLY_TOKEN  return ( ! ( METHOD_3 ( name ) ) ) && ( TYPE_1 . METHOD_2 ( VAR_2 ) . contains ( name ) ) ;  CLOSE_CURLY_TOKEN '}
{'input': b'public char METHOD_1 ( java.lang.String VAR_1 , java.lang.String name )  OPEN_CURLY_TOKEN  return null ;

In [10]:
# Remove any earlier "small" registration first — presumably so this cell can
# be re-run without a duplicate-name error.
t5.data.TaskRegistry.remove('small')
t5.data.TaskRegistry.add(
    "small",
    # Function returning the raw dataset for a split; at this point the name
    # refers to the "small" definition from the cell just above.
    dataset_fn=dumping_dataset,
    splits=["train", "validation"],
    # Function that adds the "small: " prefix and renames the fields.
    text_preprocessor=[ner_preprocessor],
)

<t5.data.dataset_providers.FunctionTask at 0x7ff62cdffa90>

## Mixtures

In [11]:
# Remove any earlier "all_mix" registration before adding it again.
t5.data.MixtureRegistry.remove("all_mix")
t5.data.MixtureRegistry.add(
    "all_mix",
    # Member tasks, registered in the cells above.
    [
     'medium',
     'small'
     ],
     # The same rate value is given to every member task.
     default_rate=1.0
)

<seqio.dataset_providers.Mixture at 0x7ff628c0e0d0>

## Define Model

In [12]:
# !gsutil -m rm -r {MODEL_DIR}

In [13]:
# Using pretrained_models from wiki + books
# Model size key (selects the hardware settings below) and the directory of
# the pre-trained checkpoint that fine-tuning starts from.
MODEL_SIZE = "base"
BASE_PRETRAINED_DIR = "gs://cotext/cc/"

PRETRAINED_DIR = BASE_PRETRAINED_DIR

# Fine-tuned checkpoints are written to a per-size subdirectory.
MODEL_DIR = os.path.join(
    "gs://t5_training/models/code/bug2fix_base_v2/", MODEL_SIZE)


# MODEL_SIZE -> (model_parallelism, train_batch_size, keep_checkpoint_max).
# The values were chosen to fit a v2-8 TPU and ~5GB of checkpoints (if
# possible); NOTE(review): TPU_TOPOLOGY in the setup cell is "v3-8".
(model_parallelism,
 train_batch_size,
 keep_checkpoint_max) = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    learning_rate_schedule=0.001,
    save_checkpoints_steps=1000,
    # Checkpoint retention is only capped when running on cloud.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)


PermissionDeniedError: ignored

## Finetune

In [None]:
# Number of fine-tuning steps to run on the "all_mix" mixture.
FINETUNE_STEPS = 45_000

model.finetune(
    pretrained_model_dir=PRETRAINED_DIR,
    mixture_or_task_name="all_mix",
    finetune_steps=FINETUNE_STEPS,
)

## Predict

In [None]:
# (dataset folder, subset) pairs. In the next cell the subset name is also used
# as the local working directory and as the prefix written into predict_input.tsv.
tasks = [
         ['bug2fix', 'small'],
         ['bug2fix', 'medium']
         ]
# NOTE(review): output_dir is not referenced by any cell visible in this notebook.
output_dir = "bug2fix_base_v2"
# Basename of the .tsv file fetched for each task.
test_file = 'test'
# Work from /content so the relative per-task directories are created there.
%cd /content/

In [None]:
for task in tasks:
  !mkdir {task[1]}
  !gsutil cp gs://cotext/data/{task[0]}/{task[1]}/{test_file}.tsv {task[1]}/
  with open(f'{task[1]}/{test_file}.tsv', 'r') as file:
    with open(f'{task[1]}/predict_input.tsv', 'w') as predict_input:
      with open(f'{task[1]}/actual_output.tsv', 'w') as actual_output:
        for line in file:
          line = line.strip().split('\t')
          input = line[0].strip()
          
          actual = line[1].strip()
          actual = ' '.join(actual.split())
          actual = actual.strip() \
                .replace(' SMALLER_TOKEN ', ' < ') \
                .replace(' GREATER_TOKEN ', ' > ')\
                .replace(' OPEN_SQUARE_TOKEN ', ' [ ')\
                .replace(' CLOSE_SQUARE_TOKEN ', ' ] ')\
                .replace(' OPEN_CURLY_TOKEN ', ' { ')\
                .replace(' CLOSE_CURLY_TOKEN', ' } ')\
                .replace(' CLOSE_CURLY_TOKEN ', ' } ')\
                .replace(' EXPONENTIAL_TOKEN ', ' ^ ')\
                .replace(' SHARP_TOKEN ', ' # ')\
                .replace(' DOLLAR_TOKEN ', ' $ ')\
                .replace(' UNK_TOKEN ', ' ` ') \
                .replace(' NEW_LINE ', ' \\n ') \
                .replace(' INDENT ', ' \\t ')

          predict_input.write(f'{task[1]}: {input}\n')
          actual_output.write(f'{actual}\n')

In [None]:
import tensorflow.compat.v1 as tf

def _checkpoint_step(path):
  """Return the numeric suffix after the last '-' in `path`, or -1 if there is none."""
  suffix = path.rsplit("-", 1)[-1]
  return int(suffix) if suffix.isdigit() else -1


for task in tasks:
  # Named `task_name`, not `dir`, so the builtin is not shadowed.
  task_name = task[0]
  predict_inputs_path = f'{task[1]}/predict_input.tsv'
  predict_outputs_path = f'{task[1]}/predict_output.tsv'

  # The task prefix was already prepended when predict_input.tsv was written.
  print(predict_inputs_path)
  print(predict_outputs_path)
  # Ignore any logging so that we only see the model's predictions.
  with tf_verbosity_level('ERROR'):
    model.batch_size = 8  # Use a small batch for inference.
    model.predict(
        input_file=predict_inputs_path,
        output_file=predict_outputs_path,
        checkpoint_steps=-1,
        temperature=0,
    )

  # The output filename has the checkpoint step appended, so glob for it and
  # order by the numeric step. A plain string sort would rank "-999" after
  # "-1000".
  prediction_files = sorted(
      tf.io.gfile.glob(predict_outputs_path + "*"), key=_checkpoint_step)
  print("Predicted task : " + task_name)
  if prediction_files:
    print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])
  else:
    print("\nNo prediction files found matching %s*\n" % predict_outputs_path)

## Scoring

In [None]:
# Fetch the CodeXGLUE repository and switch to its code-refinement directory;
# the scoring cell below runs evaluator/evaluator.py relative to it.
!git clone https://github.com/microsoft/CodeXGLUE.git
%cd /content/CodeXGLUE/Code-Code/code-refinement

In [None]:
# (dataset folder, subset) pairs to score; same values as in the Predict section.
tasks = [
         ['bug2fix', 'small'],
         ['bug2fix', 'medium']
         ]
# output_dir = "defect_detection_code_all_codesearchnet_v1"
test_file = 'test'
# Suffix of the prediction files to score: the next cell opens
# /content/<subset>/predict_output.tsv-<checkpoint>.
checkpoint = '1044900'

In [None]:
for task in tasks:
  print(task[0], task[1])
  
  with open(f'/content/{task[1]}/predict_output.tsv-{checkpoint}') as file:
    with open(f'/content/{task[1]}/predict_output.tsv', 'w') as out_file:
      for line in file:
        line = line.strip() \
                .replace('SMALLER_TOKEN', '<') \
                .replace('GREATER_TOKEN', '>')\
                .replace('OPEN_SQUARE_TOKEN', '[')\
                .replace('CLOSE_SQUARE_TOKEN', ']')\
                .replace('OPEN_CURLY_TOKEN', '{')\
                .replace('CLOSE_CURLY_TOKEN', '}')\
                .replace('EXPONENTIAL_TOKEN', '^')\
                .replace('SHARP_TOKEN', '#')\
                .replace('DOLLAR_TOKEN', '$')\
                .replace('UNK_TOKEN', '`') \
                .replace('NEW_LINE', '\\n') \
                .replace('INDENT', '\\t')
        out_file.write(f'{line}\n')
  
  !python evaluator/evaluator.py -ref /content/{task[1]}/actual_output.tsv -pre /content/{task[1]}/predict_output.tsv


In [None]:
# CodeBLEU scoring: install the pinned tree_sitter, switch to the CodeBLEU
# evaluator directory of the cloned repository, then run calc_code_bleu.py on
# each task's reference and prediction files.
!pip install tree_sitter==0.2.0
%cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU

for task in tasks:
  print(task[0], task[1])
  !python calc_code_bleu.py --refs /content/{task[1]}/actual_output.tsv --hyp /content/{task[1]}/predict_output.tsv --lang java