# Set Up

In [None]:
# Install TensorFlow and tensorflow-text at pinned versions together with t5
# (t5 itself is not pinned on this line), then clear the installer output
# from the cell.
from IPython.display import clear_output 
!pip install tensorflow==2.9 t5 tensorflow-text==2.9
# Alternative install lines, currently disabled:
#!pip install -q t5 tensorflow-text==2.4.3
#!pip install -q tensorflow-text==2.8.0rc0
#!pip install -U tensorflow-gcs-config==2.9.1
clear_output()

In [None]:
print("Installing dependencies...")
import functools
import os
import gin
import tensorflow_gcs_config
from google.colab import auth
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
from contextlib import contextmanager
import logging as py_logging
import t5

Installing dependencies...


In [None]:
TOKENIZER_DIR = "<bucket>" #@param { type: "string" }
# Refuse to continue with an empty value or the bare "gs://" scheme.
if not TOKENIZER_DIR or TOKENIZER_DIR == "gs://": 
  raise ValueError("You must enter a TOKENIZER_DIR.")

print("Setting up GCS access...")
# NOTE(review): USE_AUTH_EPHEM is read by the Colab auth flow -- confirm it is still needed.
os.environ['USE_AUTH_EPHEM'] = '0'
from google.colab import auth
auth.authenticate_user()

# Locate the TPU attached to this runtime; everything below needs its address.
TPU_TOPOLOGY = "2x2"
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  TPU_ADDRESS = tpu.get_master()
  print('Running on TPU:', TPU_ADDRESS)
except ValueError as err:
  # Raise an ordinary exception type (not BaseException) and chain the resolver
  # error so the original cause stays visible in the traceback.
  raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
#tf.config.experimental_connect_to_host(TPU_ADDRESS)

# Set credentials for GCS reading/writing from Colab and TPU.
tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API exposed by tensorflow.compat.v1.
tf.disable_v2_behavior()


# LOGGING: stop TensorFlow's logger from handing its records to ancestor
# loggers, and set the root logger's level to INFO.
tf_logger = tf.get_logger()
tf_logger.propagate = False
py_logging.getLogger().setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """
  Context manager that temporarily switches the TensorFlow logging verbosity.

  Param: level - verbosity passed to tf.logging.set_verbosity for the duration
         of the with-block
  Yields: nothing; the previous verbosity is restored when the block exits,
          including when the block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without the finally clause an exception inside the with-block would leave
    # the temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Setting up GCS access...


Instructions for updating:
non-resource variables are not supported in the long term


Running on TPU: grpc://10.49.36.146:8470


# Load Vocabulary

In [None]:
# Locations of the SentencePiece model and its vocabulary listing.
vocab_model_path, vocab_path = 'code.model file', 'code.vocab file'
print(vocab_model_path, vocab_path, sep="\n")

gs://bucket_context/eighth_experiment/code.model
gs://bucket_context/eighth_experiment/code.vocab


In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

num_special_mask_tokens = 100 #@param {type: "integer"}

def load_vocabulary():
  """
  Build the SentencePieceVocabulary used by the task's features.

  The model file is read from vocab_model_path; num_special_mask_tokens is
  passed as the second positional argument
  (presumably the number of extra sentinel ids -- confirm against seqio).
  Return: a new SentencePieceVocabulary instance on every call
  """
  vocabulary = SentencePieceVocabulary(vocab_model_path, num_special_mask_tokens)
  return vocabulary

In [None]:
# Name of the fine-tuning context. It is substituted into the dataset paths
# (ft_<config>/...) and into the output model directory
# (ft_final_model_<config>), so change it to select the context to fine-tune on.
config="call"

# Prepare Dataset for T5

In [None]:
# Each dataset lives in a folder named ft_ followed by the current context
# (the value of `config`), with one TSV file per split.
train_path = 'gs://bucket_context/eighth_experiment/ft_{}/train.tsv'.format(config) #@param { type: "string" }
eval_path = 'gs://bucket_context/eighth_experiment/ft_{}/eval.tsv'.format(config) #@param { type: "string" }
test_path = 'gs://bucket_context/eighth_experiment/ft_{}/test.tsv'.format(config) #@param { type: "string" }
# Split name -> file path, as looked up by load_dataset. Only the train and
# validation splits are exposed here; test_path is not part of this mapping.
finetune_datasets_paths = {
    "train":      train_path,
    "validation": eval_path
}

# Useful when multi-task training (example counts per split; currently disabled)
# num_input_examples = dict(train=106382, validation=12020)

In [None]:
def load_dataset(split, shuffle_files=True):
  """
  Load one split of the fine-tuning .tsv data as a tf.data.Dataset.

  Param: split - key of finetune_datasets_paths ("train" or "validation")
  Param: shuffle_files - accepted for interface compatibility and ignored,
         because every split is a single file
  Return: tf.data.Dataset of dicts with string entries 'input' and 'output',
          one per line of the file
  Raises: KeyError if split is not a key of finetune_datasets_paths
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  lines = tf.data.TextLineDataset(finetune_datasets_paths[split])

  # record_defaults holds the per-column *default values* (their dtype fixes the
  # column type). Empty strings keep an empty field empty; the previous
  # ["string", "string"] silently replaced empty fields with the text "string".
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)
  columns = lines.map(parse_line,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  return columns.map(lambda *ex: dict(zip(["input", "output"], ex)))

### A few examples

In [None]:
# The examples are read from the "train" split, so the banner says so
# (it previously announced validation examples).
print("A few raw train examples...")
for ex in tfds.as_numpy(load_dataset("train").take(5)):
  print(ex)

A few raw validation examples...
{'input': b'<extra_id_0><nl>return MapHandlerRegistration.addHandler(this, MapEventType.SHADOW_CHANGED, handler,<nl>new ShadowChangeEventFormatter());<nl>} <sep> public final HandlerRegistration addAnimationChangeHandler(AnimationChangeMapHandler handler) {<nl>return MapHandlerRegistration.addHandler(this, MapEventType.ANIMATION_CHANGED, handler,<nl>new AnimationChangeEventFormatter());<nl>}', 'output': b'public final HandlerRegistration addShadowChangeHandler(ShadowChangeMapHandler handler) {'}
{'input': b'public static com.oracle.bmc.http.internal.WrappedInvocationBuilder fromRequest(<nl>com.oracle.bmc.http.internal.RestClient client,<nl>com.oracle.bmc.core.requests.UpdateVolumeGroupBackupRequest request) {<nl>Validate.notNull(request, "request instance is required");<nl>Validate.notBlank(<nl>request.getVolumeGroupBackupId(), "volumeGroupBackupId must not be blank");<nl>Validate.notNull(<nl>request.getUpdateVolumeGroupBackupDetails(),<nl>"updateVolume

# Dataset Preprocessing

In [None]:
from tensorflow_datasets.core.utils.type_utils import Shape

def preprocessing(ds):
  """
  Turn a dataset of 'input'/'output' string pairs into the text-to-text
  layout, i.e. examples with an 'inputs' and a 'targets' field.

  Param: tf.data.Dataset whose elements have 'input' and 'output' entries
  Return: tf.data.Dataset whose elements are {'inputs': ..., 'targets': ...}
  """
  prefix = ''  # empty task prefix: nothing is prepended to the input text

  def _as_text_pair(ex):
    # Strip surrounding whitespace from both sides of the pair.
    source_text = tf.strings.strip(prefix + ex['input'])
    target_text = tf.strings.strip(ex['output'])
    return {
        'inputs': tf.strings.join([source_text], separator=' '),
        'targets': tf.strings.join([target_text], separator=' '),
    }

  return ds.map(_as_text_pair,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

### A few examples

In [None]:
# Run the first five training examples through preprocessing() and print them.
print("A few preprocessed train examples...")
train_head = load_dataset("train").take(5)
sample = tfds.as_numpy(preprocessing(train_head))
for ex in sample:
  print(ex)

A few preprocessed train examples...
{'inputs': b'<extra_id_0><nl>return MapHandlerRegistration.addHandler(this, MapEventType.SHADOW_CHANGED, handler,<nl>new ShadowChangeEventFormatter());<nl>} <sep> public final HandlerRegistration addAnimationChangeHandler(AnimationChangeMapHandler handler) {<nl>return MapHandlerRegistration.addHandler(this, MapEventType.ANIMATION_CHANGED, handler,<nl>new AnimationChangeEventFormatter());<nl>}', 'targets': b'public final HandlerRegistration addShadowChangeHandler(ShadowChangeMapHandler handler) {'}
{'inputs': b'public static com.oracle.bmc.http.internal.WrappedInvocationBuilder fromRequest(<nl>com.oracle.bmc.http.internal.RestClient client,<nl>com.oracle.bmc.core.requests.UpdateVolumeGroupBackupRequest request) {<nl>Validate.notNull(request, "request instance is required");<nl>Validate.notBlank(<nl>request.getVolumeGroupBackupId(), "volumeGroupBackupId must not be blank");<nl>Validate.notNull(<nl>request.getUpdateVolumeGroupBackupDetails(),<nl>"updat

# Creating Task and Mixture

In [None]:
# Both features use the SentencePiece vocabulary and get an EOS token appended;
# only "inputs" is marked as not required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(vocabulary=load_vocabulary(), add_eos=True, required=False),
    targets=Feature(vocabulary=load_vocabulary(), add_eos=True),
)

TASK_NAME = "ft" #@param{ type : "string"}

# TASK
# Remove any task already registered under this name before adding it again
# (presumably so the cell can be re-run -- confirm against the registry API).
t5.data.TaskRegistry.remove(TASK_NAME)
t5.data.TaskRegistry.add(
    TASK_NAME,
    dataset_fn=load_dataset,                      # returns a tf.data.Dataset per split
    splits=["train", "validation"],
    text_preprocessor=[preprocessing],            # applied to the raw dataset
    metric_fns=[t5.evaluation.metrics.accuracy],  # accuracy is the evaluation metric
    # num_input_examples is optional (helps for mixing and auto-caching) and is left unset:
    # num_input_examples=num_input_examples,
    output_features=DEFAULT_OUTPUT_FEATURES,
)

MIXTURE_NAME = "task" #@param{ type : "string"}

# MIXTURE
# A mixture holding just the task above, with rate 1.0.
t5.data.MixtureRegistry.remove(MIXTURE_NAME)
t5.data.MixtureRegistry.add(MIXTURE_NAME, [TASK_NAME], default_rate=1.0)


<seqio.dataset_providers.Mixture at 0x7ff7efbafa00>

### A few examples

In [None]:
# Fetch the registered task and print five tokenized training examples.
finetuning_task = t5.data.TaskRegistry.get(TASK_NAME)
example_lengths = {"inputs": 1024, "targets": 1024}
ds = finetuning_task.get_dataset(split="train", sequence_length=example_lengths)
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)



A few preprocessed training examples...
           7,  4943,    38, 29721,   842,    56,  8837,    13,     7,
        8428,  9639,    36,   770,   101,   209, 20449,   430,   805,
        1089,   100,    25, 16047,     7,  8428,   370,   156,   365,
         430,   805,  8626,   109,   262,   370,     7,  8428,   113,
         262,   370,   286,   926,    89,  3745,    37,    25,   222,
           7,  8428,   370,     7,  2029, 22479,    18,   421,  1678,
        1047,     7,  8428, 14544,   209,   370,   231,  2837,   101,
        1827,   286,    89,  2466,     7,   884,   347,    35,  4359,
         436,   325,  2189,    35,  1797,    35,   438,   320,   212,
        2715,   438,   436,  5555,    36,    35,  2982,   908,     7,
        8428,   113,   359,    47,  1549, 11538,     7,  1484,     8,
        3514,   347,    35,  1022,   320, 17293, 14965,   325,  2189,
          35,  2982,   101,    36,    35,  1797,  5445,    13,     7,
        2930,  1897,  3514,  2388,     7,  2977,  

# Creating Model

In [None]:
from t5 import models

FLAGS = tf.app.flags.FLAGS
# NOTE(review): defines a string flag named 'f' -- presumably to absorb the -f
# argument the notebook kernel is launched with; confirm it is still needed.
tf.app.flags.DEFINE_string('f', '', 'kernel')

#See https://github.com/google-research/text-to-text-transfer-transformer if you want to scale up the model
MODEL_SIZE = "base"

# Where fine-tuning checkpoints are written, and where the pre-trained model lives.
MODEL_DIR = 'gs://bucket_context/eighth_experiment/ft_final_model_{}'.format(config)
PRETRAINED_DIR = 'gs://bucket_context/eighth_experiment/pt_model'

# Per model size: (model_parallelism, train_batch_size, keep_checkpoint_max).
size_settings = {
    "small": (1, 64, 50),
    "base": (2, 32, 100),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1),
}
model_parallelism, train_batch_size, keep_checkpoint_max = size_settings[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)

model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 1024, "targets": 1024},
    learning_rate_schedule=0.001,
    save_checkpoints_steps=5000,
    keep_checkpoint_max=keep_checkpoint_max,
)

In [None]:
PATH_GIN_FILE = 'operative_config_constant.gin file'
import gin

FINETUNE_STEPS = 160000

# Choose the directory fine-tuning starts from. MODEL_DIR is created empty on a
# first run, so it holds no checkpoint to initialise from; in that case start
# from the pre-trained model. When MODEL_DIR already has a checkpoint (an
# interrupted run), keep using it, as this cell did before.
if tf.train.latest_checkpoint(MODEL_DIR) is not None:
    finetune_init_dir = MODEL_DIR
else:
    finetune_init_dir = PRETRAINED_DIR

with gin.unlock_config():
    gin.parse_config_file(PATH_GIN_FILE)
    #RUN FINE-TUNING
    model.finetune(
        # Use the registered mixture name rather than repeating its literal value.
        mixture_or_task_name=MIXTURE_NAME,
        pretrained_model_dir=finetune_init_dir,
        finetune_steps=FINETUNE_STEPS
    )

INFO:root:system_path_file_exists:gs://bucket_context/eighth_experiment/ft_config/operative_config_constant.gin
ERROR:root:Path not found: gs://bucket_context/eighth_experiment/ft_config/operative_config_constant.gin
INFO:root:system_path_file_exists:gs://bucket_context/eighth_experiment/ft_final_model_most_similar_crystalbleu/operative_config.gin
ERROR:root:Path not found: gs://bucket_context/eighth_experiment/ft_final_model_most_similar_crystalbleu/operative_config.gin
From /usr/local/lib/python3.8/dist-packages/tensorflow/python/training/training_util.py:396: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
From /usr/local/lib/python3.8/dist-packages/seqio/dataset_providers.py:1537: sample_from_datasets_v2 (from tensorflow.python.data.experimental.ops.interleave_o