# Set Up

In [1]:
# Install the packages this notebook relies on (shell commands run by the
# notebook front end), then clear the cell output.
from IPython.display import clear_output 
# NOTE(review): gcsfs is installed without a version pin, unlike the two
# packages below — consider pinning it for reproducibility.
!pip install gcsfs
!pip install t5==0.9.2
!pip install -q tensorflow-text==2.8.0rc0
# Clears everything printed above, which also removes any pip warnings or
# errors from view.
clear_output()

In [2]:
# Import the modules used by the rest of the notebook. The packages themselves
# are installed by the pip cell above; this cell only imports them, even though
# the status message below says "Installing".
print("Installing dependencies...")
import functools
import os
import gin
import tensorflow_gcs_config
from google.colab import auth
# TensorFlow is used through its v1 compatibility API under the name `tf`.
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
from contextlib import contextmanager
import logging as py_logging
import t5

Installing dependencies...


In [3]:
# Bucket that holds the tokenizer/experiment files; must be filled in.
TOKENIZER_DIR = "gs://bucket_context" #@param { type: "string" }
if not TOKENIZER_DIR or TOKENIZER_DIR == "gs://": 
  raise ValueError("You must enter a TOKENIZER_DIR.")

print("Setting up GCS access...")
# NOTE(review): presumably switches Colab to non-ephemeral auth — confirm.
os.environ['USE_AUTH_EPHEM'] = '0'
# Re-imported here so this cell does not depend on the import cell for `auth`.
from google.colab import auth
auth.authenticate_user()

# Set credentials for GCS reading/writing from Colab and TPU.
TPU_TOPOLOGY = "2x2"
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  TPU_ADDRESS = tpu.get_master()
  print('Running on TPU:', TPU_ADDRESS)
except ValueError as err:
  # Raise a regular exception (not a bare BaseException) and keep the
  # resolver's original error attached as the cause for easier debugging.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
tf.config.experimental_connect_to_host(TPU_ADDRESS)
tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API.
tf.disable_v2_behavior()


#LOGGING
# Stop TensorFlow's logger from also forwarding records to the root logger,
# and let INFO-level messages through on the root logger.
tf.get_logger().propagate = False
py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily change TensorFlow's logging verbosity inside a `with` block.

  Args:
    level: verbosity value passed to `tf.logging.set_verbosity`.

  Yields:
    None. The verbosity that was active on entry is restored when the block
    exits, including when the body raises an exception.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Always restore, otherwise an exception inside the block would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Setting up GCS access...
Running on TPU: grpc://10.0.196.170:8470
Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


# Load Vocabulary

In [4]:
# GCS locations of the tokenizer files. `vocab_model_path` is what
# load_vocabulary() hands to SentencePieceVocabulary; `vocab_path` is only
# printed in the visible part of the notebook.
vocab_model_path, vocab_path = (
    'gs://bucket_context/eighth_experiment/code.model',
    'gs://bucket_context/eighth_experiment/code.vocab',
)
# Echo both paths, one per line, so the cell output records what was used.
print(vocab_model_path, vocab_path, sep="\n")

gs://bucket_context/eighth_experiment/code.model
gs://bucket_context/eighth_experiment/code.vocab


In [5]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

num_special_mask_tokens = 100 #@param {type: "integer"}

@functools.lru_cache(maxsize=None)
def load_vocabulary():
  """Return the SentencePiece vocabulary used by the task features and predict.

  The vocabulary is built from `vocab_model_path` with
  `num_special_mask_tokens` extra ids. The result is memoised, so the several
  callers in this notebook share one object instead of each constructing a
  new one.

  Returns:
    A `SentencePieceVocabulary` instance (the same object on every call).

  Note:
    Because of the cache, later changes to `vocab_model_path` or
    `num_special_mask_tokens` only take effect after re-running this cell or
    calling `load_vocabulary.cache_clear()`.
  """
  return SentencePieceVocabulary(vocab_model_path, num_special_mask_tokens)

In [6]:
# Name of the fine-tuning configuration; it is substituted into the
# `ft_{}` directory of the dataset paths defined in the next cell.
config="baseline"

# Prepare Dataset for T5

In [7]:
# TSV file for each split of the selected `config`
# (gs://bucket_context/eighth_experiment/ft_<config>/<split>.tsv).
train_path = 'gs://bucket_context/eighth_experiment/ft_{}/train.tsv'.format(config) #@param { type: "string" }
eval_path = 'gs://bucket_context/eighth_experiment/ft_{}/eval.tsv'.format(config) #@param { type: "string" }
test_path = 'gs://bucket_context/eighth_experiment/ft_{}/test.tsv'.format(config) #@param { type: "string" }
# Split name -> file path, as looked up by load_dataset(). Only "train" and
# "validation" are listed; `test_path` is defined above but not included here.
finetune_datasets_paths = {
    "train":      train_path,
    "validation": eval_path
}

# Useful when multi-task training 
# num_input_examples = dict(train=106382, validation=12020) 

In [8]:
def load_dataset(split, shuffle_files=True):
  """Read one split's TSV file into a tf.data.Dataset of input/output pairs.

  Each line of the file is split on tabs (quotes are not treated specially)
  into two string columns.

  Args:
    split: key into `finetune_datasets_paths` ("train" or "validation").
    shuffle_files: accepted but ignored, because every split is a single file.

  Returns:
    A dataset whose elements are dicts with the keys "input" and "output".

  Raises:
    KeyError: if `split` is not a key of `finetune_datasets_paths`.
  """
  # We only have one file for each split.
  del shuffle_files

  # `record_defaults` fixes the column dtype AND the value substituted for an
  # empty field. Use empty strings so a blank column stays blank instead of
  # being replaced by the literal text "string".
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(finetune_datasets_paths[split])
  ds = ds.map(parse_tsv_line,
              num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds

### A few examples

In [9]:
# Show the first five rows of the validation split exactly as load_dataset()
# yields them (before any text-to-text preprocessing).
print("A few raw validation examples...")
raw_validation_head = load_dataset("validation").take(5)
for raw_example in tfds.as_numpy(raw_validation_head):
  print(raw_example)

A few raw validation examples...
{'input': b'@Override<nl><extra_id_0><nl>if (!softLimitsEnabled) {<nl>return;<nl>}<nl>double xOffset = workCoord.x - machineCoord.x;<nl>double yOffset = workCoord.y - machineCoord.y;<nl>double zOffset = workCoord.z - machineCoord.z;<nl>Position bottomLeft = new Position(-maxPosition.getX() + xOffset, -maxPosition.getY() + yOffset, -maxPosition.getZ() + zOffset);<nl>Position topRight = new Position(xOffset, yOffset, zOffset);<nl>GL2 gl = drawable.getGL().getGL2();<nl>gl.glPushMatrix();<nl>drawBase(gl, bottomLeft, topRight);<nl>drawSides(gl, bottomLeft, topRight);<nl>drawAxisLines(gl, bottomLeft, topRight);<nl>gl.glPopMatrix();<nl>}', 'output': b'public void draw(GLAutoDrawable drawable, boolean idle, Position machineCoord, Position workCoord, Position focusMin, Position focusMax, double scaleFactor, Position mouseWorldCoordinates, Position rotation) {'}
{'input': b'@Override<nl>public void draw(GLAutoDrawable drawable, boolean idle, Position machineCoord

# Dataset Preprocessing

In [10]:
from tensorflow_datasets.core.utils.type_utils import Shape

def preprocessing(ds):
  """Map raw {"input", "output"} examples to T5's {"inputs", "targets"} format.

  Both fields are whitespace-stripped; an (empty) prefix is prepended to the
  input before stripping.

  Args:
    ds: tf.data.Dataset whose elements are dicts with "input" and "output".

  Returns:
    tf.data.Dataset whose elements are dicts with "inputs" and "targets".
  """
  prefix = ''  # empty prefix: nothing is prepended to the input text

  def to_inputs_and_targets(ex):
    source_text = tf.strings.strip(prefix + ex['input'])
    target_text = tf.strings.strip(ex['output'])
    return {
        'inputs': tf.strings.join([source_text], separator=' '),
        'targets': tf.strings.join([target_text], separator=' '),
    }

  text_to_text_ds = ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return text_to_text_ds

### A few examples

In [11]:
# Show the first five training rows after preprocessing() has renamed and
# stripped the fields.
print("A few preprocessed train examples...")
train_head = load_dataset("train").take(5)
sample = tfds.as_numpy(preprocessing(train_head))
for preprocessed_example in sample:
  print(preprocessed_example)

A few preprocessed train examples...
{'inputs': b'<extra_id_0><nl>return MapHandlerRegistration.addHandler(this, MapEventType.SHADOW_CHANGED, handler,<nl>new ShadowChangeEventFormatter());<nl>}', 'targets': b'public final HandlerRegistration addShadowChangeHandler(ShadowChangeMapHandler handler) {'}
{'inputs': b'public static com.oracle.bmc.http.internal.WrappedInvocationBuilder fromRequest(<nl>com.oracle.bmc.http.internal.RestClient client,<nl>com.oracle.bmc.core.requests.UpdateVolumeGroupBackupRequest request) {<nl>Validate.notNull(request, "request instance is required");<nl>Validate.notBlank(<nl>request.getVolumeGroupBackupId(), "volumeGroupBackupId must not be blank");<nl>Validate.notNull(<nl>request.getUpdateVolumeGroupBackupDetails(),<nl>"updateVolumeGroupBackupDetails is required");<nl>com.oracle.bmc.http.internal.WrappedWebTarget target =<nl>client.getBaseTarget()<nl>.path("/20160918")<nl>.path("volumeGroupBackups")<nl>.path(<nl>com.oracle.bmc.util.internal.HttpUtils.encodePat

# Creating Task and Mixture

In [12]:
# Feature specs for the two text fields: each uses the vocabulary returned by
# load_vocabulary() and has add_eos enabled; "inputs" is marked not required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(vocabulary=load_vocabulary(), add_eos=True, required=False),
    targets=Feature(vocabulary=load_vocabulary(), add_eos=True),
)

TASK_NAME = "ft" #@param{ type : "string"}

# TASK
# Remove the name first so the add() below can run again when this cell is
# re-executed.
t5.data.TaskRegistry.remove(TASK_NAME)
t5.data.TaskRegistry.add(
    TASK_NAME,
    # Callable producing the tf.data.Dataset for a split.
    dataset_fn=load_dataset,
    splits=["train", "validation"],
    # Preprocessing functions applied to that dataset.
    text_preprocessor=[preprocessing],
    # Evaluation metric: accuracy.
    metric_fns=[t5.evaluation.metrics.accuracy],
    # num_input_examples is optional (helps for mixing and auto-caching) and
    # is left unset here.
    output_features=DEFAULT_OUTPUT_FEATURES,
)

MIXTURE_NAME = "task" #@param{ type : "string"}

# MIXTURE
# A mixture made of the single task above, with rate 1.0.
t5.data.MixtureRegistry.remove(MIXTURE_NAME)
t5.data.MixtureRegistry.add(MIXTURE_NAME, [TASK_NAME], default_rate=1.0)


<seqio.dataset_providers.Mixture at 0x7fc4d6f54550>

### A few examples

In [13]:
# Pull the registered task back out of the registry and print five training
# examples as produced by get_dataset() for the given sequence lengths.
finetuning_task = t5.data.TaskRegistry.get(TASK_NAME)
preview_sequence_length = {"inputs": 1024, "targets": 1024}
ds = finetuning_task.get_dataset(
    split="train",
    sequence_length=preview_sequence_length)
print("A few preprocessed training examples...")
for tokenized_example in tfds.as_numpy(ds.take(5)):
  print(tokenized_example)

A few preprocessed training examples...
{'inputs_pretokenized': b'public void insert(int index, int length) {<nl>ensureCapacity(pos + length);<nl>if (pos > index) {<nl><extra_id_0><nl>}<nl>pos += length;<nl>}', 'inputs': array([   29,    48,  1011,   325,  2775,   128,   101,    38,    82,
         349,    13,     7, 26652,  2290,   325,  1962,    32,    82,
         908,     7,  1484,     8,  1962,    21,   128,   349,    13,
           7,  2029, 22479,    18,   421,  1678,  1047,     7,  2977,
           7,  1962,   306,    82,   785,     7,  2977,     1],
      dtype=int32), 'targets_pretokenized': b'System.arraycopy(data, index, data, index + length, pos - index);', 'targets': array([ 210,   35, 3687, 7701,  325, 1144,  101,  128,  101,  116,  101,
        128,   32,   82,  101,  472,   87,  128,  908,    1], dtype=int32)}
       23281, 19672,   131,  2029,   601,  1047,   205,  4528,   667,
          88,   325,  1360,   924,    99,  2029,   601,  1047,    73,
         101,  7088, 

# Creating Model

In [14]:
from t5 import models

from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Learning-rate schedule: polynomial decay (power 0.5) from the starting rate
# to the end rate over `decay_steps` steps.
starter_learning_rate = 0.01
end_learning_rate = 1e-06
decay_steps = 10000

learning_rate_fn = PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=0.5)

FLAGS = tf.app.flags.FLAGS
# Define the dummy 'f' flag only if it is not registered yet: defining the
# same flag twice raises an error, which would make this cell fail when it is
# re-run in the same kernel.
# NOTE(review): presumably the flag exists to absorb the `-f` argument the
# notebook kernel is launched with — confirm.
if 'f' not in FLAGS:
  tf.app.flags.DEFINE_string ('f', '', 'kernel')

#See https://github.com/google-research/text-to-text-transfer-transformer if you want to scale up the model
MODEL_SIZE = "base"  

# Where this run's checkpoints are written.
MODEL_DIR = 'gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model'

# Checkpoint directory of the pre-trained model that fine-tuning starts from.
PRETRAINED_DIR='gs://bucket_context/eighth_experiment/pt_model'


# Per-size settings: (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 64, 50),
    "base": (2, 32, 30),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


tf.io.gfile.makedirs(MODEL_DIR)

model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 1024, "targets": 1024},
    learning_rate_schedule = learning_rate_fn,
    save_checkpoints_steps=10000,
    keep_checkpoint_max=keep_checkpoint_max
)

In [15]:
# Gin configuration file parsed before fine-tuning starts.
PATH_GIN_FILE = 'gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/operative_config.gin'
import gin

# Number of fine-tuning steps passed to model.finetune().
FINETUNE_STEPS = 30000

with gin.unlock_config():
    gin.parse_config_file(PATH_GIN_FILE)
    #RUN FINE-TUNING
    # Use the registered mixture's name constant rather than repeating the
    # literal, so this call keeps working if MIXTURE_NAME is changed.
    model.finetune(
        mixture_or_task_name=MIXTURE_NAME,
        pretrained_model_dir=PRETRAINED_DIR,
        finetune_steps=FINETUNE_STEPS
    )

INFO:root:system_path_file_exists:gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/operative_config.gin
ERROR:root:Path not found: gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/operative_config.gin
INFO:root:system_path_file_exists:gs://bucket_context/eighth_experiment/pt_model/operative_config.gin
ERROR:root:Path not found: gs://bucket_context/eighth_experiment/pt_model/operative_config.gin


INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.0.196.170:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.0.196.170:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.0.196.170:8470', '_evaluation_master': 'grp

In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# NOTE: this changes `model.batch_size` in place, so any later call on `model`
# (e.g. model.predict) also runs with 512.
model.batch_size = 512
model.eval(
    # Same mixture that was registered and fine-tuned on, referenced through
    # its name constant instead of a repeated literal.
    mixture_or_task_name=MIXTURE_NAME,
    checkpoint_steps=-1 #evaluate only last checkpoint
)

In [16]:
# Generate predictions with model.predict, passing the decoding settings
# (beam_size, temperature, keep_top_k) explicitly.
# NOTE(review): the input file lives under HP_TUNING/constant/ while the
# output goes under HP_TUNING/polynomial/ — confirm this is intended.
vocabulary_predict=load_vocabulary()

predict_input_file = 'gs://bucket_context/eighth_experiment/HP_TUNING/constant/model/validation_eval/ft_inputs'
predict_output_file = 'gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model/predictions.txt'

model.predict(
    input_file=predict_input_file,
    output_file=predict_output_file,
    checkpoint_steps=-1,
    beam_size=1,
    temperature=0.0,
    keep_top_k=-1,
    vocabulary=vocabulary_predict)

INFO:root:system_path_file_exists:gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model/operative_config.gin
ERROR:root:Path not found: gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model/operative_config.gin


INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket_context/eighth_experiment/HP_TUNING/polynomial/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.0.196.170:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.0.196.170:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.0.196.170:8470', '_evaluation_master': 'grp