In [None]:
# Colab magic: select the TensorFlow 2.x runtime.
%tensorflow_version 2.x
#!pip3 install --upgrade pip
#!pip install -qU t5
# Install T5 from GitHub pinned to a specific commit (marked "temporary fix"
# on the line itself) instead of the PyPI release commented out above.
!pip install -q git+https://github.com/google-research/text-to-text-transfer-transformer.git@1e269e72a981fde4ea64a88a0a0d8cc88871e20a #temporary fix

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Set the base dir (Google Cloud Storage bucket); replace the placeholder
# below with the real bucket path before running.
BASE_DIR = "gs://" 

# Fail fast while BASE_DIR is empty or still the bare "gs://" placeholder.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
# Default data/model locations derived from BASE_DIR.
# NOTE(review): DATA_DIR is reassigned by a later cell; MODELS_DIR is not
# referenced elsewhere in the visible notebook.
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
# Enables the Colab/TPU/GCS setup and cloud-specific options further down.
ON_CLOUD = True


# Cloud-only setup: detect the TPU, authenticate the Colab user and configure
# GCS access. Defines the notebook-level names TPU_TOPOLOGY and TPU_ADDRESS.
if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary exception type (a bare BaseException bypasses
    # `except Exception` handlers) and keep the resolver error as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Turn off TF2 behavior on the `tensorflow.compat.v1` module for the rest of
# the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop the TF logger from propagating records to its parent loggers
  # (presumably to avoid duplicated lines) and set the root logger to INFO.
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Context manager that temporarily sets the TF logging verbosity.

  The verbosity in effect on entry is read with `tf.logging.get_verbosity()`
  and restored on exit, including when the body of the `with` block raises.

  Args:
    level: verbosity value passed to `tf.logging.set_verbosity`.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Always restore the previous level, even if the managed block failed.
    tf.logging.set_verbosity(og_level)

In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.data.utils import Feature
from t5.data import sentencepiece_vocabulary

# Set the paths of the SentencePiece model and vocab files.
# They must be the same ones used in the pre-training phase.
vocab_model_path = 'gs://.....model'
# NOTE(review): `vocab_path` is not referenced anywhere else in the visible
# notebook; only `vocab_model_path` is used.
vocab_path = 'gs://.....vocab'


# Short aliases for the T5 registry and task classes.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# Extra-ids count handed to the SentencePiece vocabulary constructor.
DEFAULT_EXTRA_IDS = 100

def get_default_vocabulary():
  """Returns a new SentencePieceVocabulary for the configured model file.

  The vocabulary is built from `vocab_model_path` with `DEFAULT_EXTRA_IDS`
  extra ids; every call creates a fresh instance.
  """
  vocabulary_cls = sentencepiece_vocabulary.SentencePieceVocabulary
  return vocabulary_cls(vocab_model_path, DEFAULT_EXTRA_IDS)

# Output features shared by every task: both sides use their own instance of
# the default vocabulary and have EOS appended.
FEATURES = {
    feature_name: Feature(vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_name in ("inputs", "targets")
}

In [None]:
# Dataset sizes:
#   - training: 92476
#   - eval:     11560
#   - test:     11559
#
# The "validation" split below is read from test.tsv, and its example count
# matches the test size listed above.

DATA_DIR = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/tp_ident")

# Split name -> TSV file for the mutant-generation task.
nq_tsv_path_mutant = dict(
    train=os.path.join(DATA_DIR, "training.tsv"),
    validation=os.path.join(DATA_DIR, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_mutants = {"train": 92476, "validation": 11559}

In [None]:
def nq_dataset_mutant(split, shuffle_files=False):
  """Builds the raw dataset of the mutant-generation task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_mutant` line by
  line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_mutant` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "fixed" and "buggy".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_mutant`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_mutant[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["fixed", "buggy"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
print("A few raw valid examples...")
for ex in tfds.as_numpy(nq_dataset_mutant("validation").take(5)):
  print(ex)


In [None]:
def mutant_preprocessing(ds):
  """Converts raw mutant examples into text-to-text 'inputs'/'targets'.

  For every example, the lower-cased 'fixed' string prefixed with
  'generate mutant: ' becomes 'inputs' and the lower-cased 'buggy' string
  becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'fixed' and 'buggy'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['fixed'])
    target = tf.strings.lower(example['buggy'])
    return {
        'inputs': tf.strings.join(['generate mutant: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# (Re-)register the "mutants" task. The `remove` call comes first, presumably
# so the cell can be re-run — NOTE(review): confirm `remove` tolerates a name
# that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('mutants')
t5.data.TaskRegistry.add(
    "mutants",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_mutant,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[mutant_preprocessing],
    output_features=FEATURES,
    metric_fns=[t5.evaluation.metrics.bleu],
    num_input_examples=num_nq_examples_mutants
)

In [None]:
# Sanity check: fetch the registered "mutants" task and print five examples of
# its "train" split as returned by `get_dataset`.
# `nq_task`, `ds` and `ex` are notebook-level names reassigned by later cells.
nq_task = t5.data.TaskRegistry.get("mutants")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#FIRST TASK CREATED

In [None]:
# Dataset sizes:
#   - training: 46680
#   - eval:     5835
#   - test:     5835
#
# The "validation" split below is read from test.tsv.

DATA_DIR_1 = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/small/")

# Split name -> TSV file for the small bug-fix task.
nq_tsv_path_bfp_small = dict(
    train=os.path.join(DATA_DIR_1, "training.tsv"),
    validation=os.path.join(DATA_DIR_1, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_bfp_small = {"train": 46680, "validation": 5835}

In [None]:
def nq_dataset_bfp_small(split, shuffle_files=False):
  """Builds the raw dataset of the small bug-fix task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_bfp_small` line
  by line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_bfp_small` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "buggy" and "fixed".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_bfp_small`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_bfp_small[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["buggy", "fixed"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
print("A few raw valid examples...")
for ex in tfds.as_numpy(nq_dataset_bfp_small("validation").take(5)):
  print(ex)

In [None]:
def bfp_preprocessing_small(ds):
  """Converts raw small bug-fix examples into 'inputs'/'targets' text.

  For every example, the lower-cased 'buggy' string prefixed with
  'generate small patch: ' becomes 'inputs' and the lower-cased 'fixed'
  string becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'buggy' and 'fixed'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['buggy'])
    target = tf.strings.lower(example['fixed'])
    return {
        'inputs': tf.strings.join(['generate small patch: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases re-declared here; the same assignments already appear in the
# vocabulary cell and neither alias is used in this cell.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# (Re-)register the "bfp_small" task. NOTE(review): confirm `remove` tolerates
# a name that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('bfp_small')
t5.data.TaskRegistry.add(
    "bfp_small",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_bfp_small,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[bfp_preprocessing_small],
    output_features=FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples = num_nq_examples_bfp_small
)


In [None]:
# Sanity check: fetch the registered "bfp_small" task and print five examples
# of its "train" split as returned by `get_dataset`.
# `nq_task`, `ds` and `ex` are notebook-level names reassigned by later cells.
nq_task = t5.data.TaskRegistry.get("bfp_small")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#SECOND TASK CREATED

In [None]:
# Dataset sizes:
#   - training: 52364
#   - eval:     6546
#   - test:     6545
#
# The "validation" split below is read from test.tsv, and its example count
# matches the test size listed above.

DATA_DIR_1 = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/medium/")

# Split name -> TSV file for the medium bug-fix task.
nq_tsv_path_bfp_medium = dict(
    train=os.path.join(DATA_DIR_1, "training.tsv"),
    validation=os.path.join(DATA_DIR_1, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_bfp_medium = {"train": 52364, "validation": 6545}

In [None]:
def nq_dataset_bfp_medium(split, shuffle_files=False):
  """Builds the raw dataset of the medium bug-fix task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_bfp_medium` line
  by line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_bfp_medium` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "buggy" and "fixed".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_bfp_medium`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_bfp_medium[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["buggy", "fixed"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
print("A few raw valid examples...")
for ex in tfds.as_numpy(nq_dataset_bfp_medium("validation").take(5)):
  print(ex)

In [None]:
def bfp_preprocessing_medium(ds):
  """Converts raw medium bug-fix examples into 'inputs'/'targets' text.

  For every example, the lower-cased 'buggy' string prefixed with
  'generate medium patch: ' becomes 'inputs' and the lower-cased 'fixed'
  string becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'buggy' and 'fixed'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['buggy'])
    target = tf.strings.lower(example['fixed'])
    return {
        'inputs': tf.strings.join(['generate medium patch: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases re-declared here; the same assignments already appear in the
# vocabulary cell and neither alias is used in this cell.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# NOTE(review): DATASET_TYPE is not referenced anywhere else in the visible
# notebook.
DATASET_TYPE='medium'

# (Re-)register the "bfp_medium" task. NOTE(review): confirm `remove`
# tolerates a name that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('bfp_medium')
t5.data.TaskRegistry.add(
    "bfp_medium",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_bfp_medium,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[bfp_preprocessing_medium],
    output_features=FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples = num_nq_examples_bfp_medium
)

In [None]:
# Sanity check: fetch the registered "bfp_medium" task and print five examples
# of its "train" split as returned by `get_dataset`.
# `nq_task`, `ds` and `ex` are notebook-level names reassigned by later cells.
nq_task = t5.data.TaskRegistry.get("bfp_medium")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#THIRD TASK CREATED

In [None]:
# Dataset sizes:
#   - training: 126477
#   - eval:     15809
#   - test:     15810
#
# The "validation" split below is read from test.tsv, and its example count
# matches the test size listed above.

DATA_DIR_1 = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/abt/")

# Split name -> TSV file for the abstracted assert-generation task.
nq_tsv_path_assert_abt = dict(
    train=os.path.join(DATA_DIR_1, "training.tsv"),
    validation=os.path.join(DATA_DIR_1, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_assert_abt = {"train": 126477, "validation": 15810}

In [None]:
def nq_dataset_assert_abt(split, shuffle_files=False):
  """Builds the raw dataset of the "assert_abt" task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_assert_abt` line
  by line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_assert_abt` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "method" and
    "assert".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_assert_abt`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_assert_abt[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["method", "assert"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
# The `idx` produced by `enumerate` is not used in the loop body.
print("A few raw valid examples...")
for idx,ex in enumerate(tfds.as_numpy(nq_dataset_assert_abt("validation").take(5))):
  print(ex)

In [None]:
def atlas_preprocessing_abt(ds):
  """Converts raw "assert_abt" examples into 'inputs'/'targets' text.

  For every example, the lower-cased 'method' string prefixed with
  'generate abt assert: ' becomes 'inputs' and the lower-cased 'assert'
  string becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'method' and 'assert'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['method'])
    target = tf.strings.lower(example['assert'])
    return {
        'inputs': tf.strings.join(['generate abt assert: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases re-declared here; the same assignments already appear in the
# vocabulary cell and neither alias is used in this cell.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# (Re-)register the "assert_abt" task. NOTE(review): confirm `remove`
# tolerates a name that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('assert_abt')
t5.data.TaskRegistry.add(
    "assert_abt",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_assert_abt,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[atlas_preprocessing_abt],
    output_features=FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples = num_nq_examples_assert_abt
)


In [None]:
# Sanity check: fetch the registered "assert_abt" task and print five examples
# of its "train" split as returned by `get_dataset`.
# `nq_task`, `ds` and `ex` are notebook-level names reassigned by later cells.
nq_task = t5.data.TaskRegistry.get("assert_abt")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#FOURTH TASK CREATED

In [None]:
# Dataset sizes:
#   - training: 150523
#   - eval:     18816
#   - test:     18815
#
# The "validation" split below is read from test.tsv, and its example count
# matches the test size listed above.

DATA_DIR_1 = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/raw/")

# Split name -> TSV file for the raw assert-generation task.
nq_tsv_path_assert_raw = dict(
    train=os.path.join(DATA_DIR_1, "training.tsv"),
    validation=os.path.join(DATA_DIR_1, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_assert_raw = {"train": 150523, "validation": 18815}

In [None]:
def nq_dataset_assert_raw(split, shuffle_files=False):
  """Builds the raw dataset of the "assert_raw" task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_assert_raw` line
  by line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_assert_raw` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "method" and
    "assert".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_assert_raw`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_assert_raw[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["method", "assert"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
# The `idx` produced by `enumerate` is not used in the loop body.
print("A few raw valid examples...")
for idx,ex in enumerate(tfds.as_numpy(nq_dataset_assert_raw("validation").take(5))):
  print(ex)

In [None]:
def atlas_preprocessing_raw(ds):
  """Converts raw "assert_raw" examples into 'inputs'/'targets' text.

  For every example, the lower-cased 'method' string prefixed with
  'generate raw assert: ' becomes 'inputs' and the lower-cased 'assert'
  string becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'method' and 'assert'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['method'])
    target = tf.strings.lower(example['assert'])
    return {
        'inputs': tf.strings.join(['generate raw assert: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases re-declared here; the same assignments already appear in the
# vocabulary cell and neither alias is used in this cell.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# NOTE(review): ASSERT_TYPE is not referenced anywhere else in the visible
# notebook.
ASSERT_TYPE='raw'

# (Re-)register the "assert_raw" task. NOTE(review): confirm `remove`
# tolerates a name that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('assert_raw')
t5.data.TaskRegistry.add(
    "assert_raw",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_assert_raw,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[atlas_preprocessing_raw],
    output_features=FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples = num_nq_examples_assert_raw
)


In [None]:
# Sanity check: fetch the registered "assert_raw" task and print five examples
# of its "train" split as returned by `get_dataset`.
# `nq_task`, `ds` and `ex` are notebook-level names reassigned by later cells.
nq_task = t5.data.TaskRegistry.get("assert_raw")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#FIFTH TASK CREATED

In [None]:
# Dataset sizes:
#   - training: 1953940
#   - eval:     104272
#   - test:     90908
#
# The "validation" split below is read from test.tsv, and its example count
# matches the test size listed above.

DATA_DIR_1 = os.path.join(BASE_DIR, "data/datasets/finetuning-ds/comment/")

# Split name -> TSV file for the comment-generation task.
nq_tsv_path_comment = dict(
    train=os.path.join(DATA_DIR_1, "training.tsv"),
    validation=os.path.join(DATA_DIR_1, "test.tsv"),
)

# Split name -> number of examples in that file.
num_nq_examples_comment = {"train": 1953940, "validation": 90908}

In [None]:
def nq_dataset_comment(split, shuffle_files=False):
  """Builds the raw dataset of the comment-generation task for one split.

  Reads the TSV file registered for `split` in `nq_tsv_path_comment` line by
  line and parses each line into two tab-separated string columns.

  Args:
    split: key of `nq_tsv_path_comment` ("train" or "validation").
    shuffle_files: accepted but ignored, since each split is a single file.

  Returns:
    A `tf.data.Dataset` of dicts with the string features "method" and
    "comment".

  Raises:
    KeyError: if `split` is not a key of `nq_tsv_path_comment`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path_comment[split])
  # `record_defaults` holds per-column default VALUES (their dtype selects the
  # column type). Use empty strings so an empty field stays empty instead of
  # being replaced by the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["method", "comment"], ex)))
  return ds

# Sanity check: print the first five raw examples of the "validation" split.
# The `idx` produced by `enumerate` is not used in the loop body.
print("A few raw valid examples...")
for idx,ex in enumerate(tfds.as_numpy(nq_dataset_comment("validation").take(5))):
  print(ex)

In [None]:
def preprocessing_comment(ds):
  """Converts raw comment-generation examples into 'inputs'/'targets' text.

  For every example, the lower-cased 'method' string prefixed with
  'generate comment: ' becomes 'inputs' and the lower-cased 'comment' string
  becomes 'targets'.

  Args:
    ds: dataset of dicts with the string features 'method' and 'comment'.

  Returns:
    The mapped dataset of {'inputs', 'targets'} dicts.
  """

  def _as_text_pair(example):
    source = tf.strings.lower(example['method'])
    target = tf.strings.lower(example['comment'])
    return {
        'inputs': tf.strings.join(['generate comment: ', source]),
        'targets': target,
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases re-declared here; the same assignments already appear in the
# vocabulary cell and neither alias is used in this cell.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

# (Re-)register the "comments" task. NOTE(review): confirm `remove` tolerates
# a name that has not been registered yet on a fresh kernel.
t5.data.TaskRegistry.remove('comments')
t5.data.TaskRegistry.add(
    "comments",
    # Raw TSV reader for this task.
    dataset_fn=nq_dataset_comment,
    splits=["train", "validation"],
    # Maps the raw columns to 'inputs'/'targets' text.
    text_preprocessor=[preprocessing_comment],
    output_features=FEATURES,
    # This task is scored with both BLEU and ROUGE.
    metric_fns=[t5.evaluation.metrics.bleu, t5.evaluation.metrics.rouge],
    num_input_examples = num_nq_examples_comment
)

In [None]:
# Sanity check: fetch the registered "comments" task and print five examples
# of its "train" split as returned by `get_dataset`.
nq_task = t5.data.TaskRegistry.get("comments")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

#SIXTH TASK CREATED

In [None]:
# The following function implements proportional sampling.
# The idea is to sample each task at a rate proportional to the size of its dataset.
# By doing so, we should be able to cope with overfitting.

def _rate_num_input_examples(task):
  """Returns the mixing rate of `task`: its example count as a float.

  The "train" split is preferred; "validation" is used only when the task has
  no "train" split.

  Args:
    task: object exposing `splits`, `num_input_examples(split)` and `name`.

  Raises:
    ValueError: if the task has neither a "train" nor a "validation" split.
  """
  for candidate_split in ("train", "validation"):
    if candidate_split in task.splits:
      return float(task.num_input_examples(candidate_split))
  raise ValueError("Task %s does not have a train or validation split." % (task.name))


# (Re-)register the "all_tasks" mixture over the six tasks defined above.
# Each task's rate is computed by `_rate_num_input_examples`; the
# commented-out line is the alternative of a constant rate for every task.
# NOTE(review): confirm `remove` tolerates a name that has not been
# registered yet on a fresh kernel.
t5.data.MixtureRegistry.remove("all_tasks")
t5.data.MixtureRegistry.add(
    "all_tasks",
    ["bfp_small", "bfp_medium", "assert_abt", "assert_raw", "mutants","comments"],
    default_rate=_rate_num_input_examples
     #default_rate=1.0
)

In [None]:
from mesh_tensorflow.transformer import learning_rate_schedules


# Size key used to select the per-size settings in the table below.
MODEL_SIZE = "small" 

# Set the folder where the checkpoints and all other outputs will be written
MODEL_DIR = 'gs://...../'

# Specify the pre-trained dir, which must contain the pre-trained model, the operative_config.gin file and the checkpoint file
#PRETRAINED_DIR='gs://...../pretrained_model/'


# Per-size settings: (model_parallelism, train_batch_size, keep_checkpoint_max).
# An unknown MODEL_SIZE raises KeyError here.
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)

# NOTE(review): TPU_ADDRESS and TPU_TOPOLOGY are only assigned by the setup
# cell when ON_CLOUD is true; with ON_CLOUD false this call fails with a
# NameError.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    # Same lengths as used in the preview cells above.
    sequence_length={"inputs": 512, "targets": 512},
    learning_rate_schedule=learning_rate_schedules.slanted_triangular,
    save_checkpoints_steps=5000,
    # The per-size checkpoint limit is applied only when running on cloud.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

In [None]:
#RUN FINE-TUNING
# Step count passed to `model.finetune` as `finetune_steps`.
FINETUNE_STEPS = 2000000

# NOTE(review): `pretrained_model_dir` points at MODEL_DIR (the output
# folder), while the PRETRAINED_DIR setting is commented out in the model
# cell — confirm this is the intended location of the starting checkpoint.
model.finetune(
    mixture_or_task_name="all_tasks",
    pretrained_model_dir=MODEL_DIR,
    finetune_steps=FINETUNE_STEPS
)

In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# Note: this overwrites `model.batch_size`, so any later call on `model`
# (e.g. re-running the fine-tuning cell) sees the 4x value.
model.batch_size = train_batch_size * 4
model.eval(
    mixture_or_task_name="all_tasks",
    checkpoint_steps=-1 #evaluate only last checkpoint
)

In [None]:
if ON_CLOUD:
  %reload_ext tensorboard
  import tensorboard as tb
tb.notebook.start("--logdir " + MODEL_DIR)