## Code to automatically click "Connect" (run it in the browser console: Cmd+Alt+I)
```
// Click the element matched by "colab-connect-button" and log that we did so.
function KeepClicking() {
  console.log("Clicking");
  const connectButton = document.querySelector("colab-connect-button");
  connectButton.click();
}
// Repeat the click every 60000 ms (once a minute).
setInterval(KeepClicking, 60000);
```



In [None]:
# Notebook setup: pick the model sub-directory and install the T5 package.
print("Installing dependencies...")
model_dir_name = "models" # sub-directory (under BASE_DIR) for the ToTTo fine-tuned model
#model_dir_name = "noTottoModels"
# Colab magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
!pip install -q t5

# One-off shell command (kept for reference) that copies the pretrained T5 3B
# checkpoint into a project bucket:
# gsutil cp -r "gs://t5-data/pretrained_models/3B/" "gs://totto-explain-claim/models/"

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5
import t5.models

# Root GCS location; the data and model directories are derived from it.
BASE_DIR = "gs://pythia_totto" #@param { type: "string" }
# Reject an empty value or a bare "gs://" prefix.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
DATA_DIR = os.path.join(BASE_DIR, "data")
# model_dir_name is chosen at the top of this cell.
MODELS_DIR = os.path.join(BASE_DIR, model_dir_name)
print("MODELS_DIR:", MODELS_DIR)
# When True, the TPU/GCS setup below runs and TPU_ADDRESS / TPU_TOPOLOGY get defined.
ON_CLOUD = True


# Connect to the Colab TPU and authorise GCS access. TPU_TOPOLOGY and
# TPU_ADDRESS are only defined when ON_CLOUD is True.
if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "v2-8"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise RuntimeError rather than BaseException: BaseException bypasses
    # `except Exception` handlers and is meant for interpreter-level exits.
    # The original ValueError is chained so its details are not lost.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.enable_eager_execution()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Runs regardless of ON_CLOUD, after the eager-mode connection above.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Keep TF log records from propagating to ancestor loggers
  # (presumably to avoid duplicated lines in Colab output — TODO confirm).
  tf.get_logger().propagate = False
  # Let INFO-level records through the root Python logger.
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TF logging verbosity to `level`.

  The verbosity that was active on entry is restored when the `with` block
  exits, including when the block raises.

  Args:
    level: verbosity accepted by `tf.logging.set_verbosity` (e.g. 'ERROR').
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without try/finally an exception in the body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

In [None]:
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Echo the resolved data and model locations.
print(f"DATA_DIR: {DATA_DIR}")
print(f"MODELS_DIR: {MODELS_DIR}")

In [None]:
# Install the `datasets` package, imported below for load_dataset / load_metric.
!pip install datasets

In [None]:
# List the names currently registered in the T5 mixture registry.
print(t5.data.MixtureRegistry.names())

In [None]:
def toList(lst):
    """Wrap every element of `lst` in its own single-item list."""
    return [[item] for item in lst]

def toLower(lst):
    """Return a new list with every string of `lst` lowercased."""
    return [text.lower() for text in lst]

def toTokens(lst):
    """Split every string of `lst` on whitespace, returning a list of token lists."""
    return [str.split(text) for text in lst]

#l = ['Uno Due Tre', 'Uno Due', 'UNO']
#print(toList(l))
#print(toLower(l))
#print(toTokens(toLower(l)))
#print(toList(toTokens(toLower(l))))

In [None]:
def compare(str1, str2):
  """Return True when both strings are equal ignoring case and surrounding whitespace."""
  return str1.lower().strip() == str2.lower().strip()

def evaluate(predicted, actual):
  """Return the fraction of predictions that match their reference.

  Args:
    predicted: sequence of predicted strings.
    actual: sequence of reference strings, indexed in step with `predicted`;
      entries beyond len(predicted) are ignored.

  Returns:
    Matches (as judged by `compare`) divided by len(predicted), or 0.0 when
    `predicted` is empty (previously a ZeroDivisionError).

  Raises:
    IndexError: if `actual` is shorter than `predicted`.
  """
  total = len(predicted)
  if total == 0:
    return 0.0
  # Index into `actual` (rather than zip) so a too-short reference list still
  # fails loudly instead of being silently truncated.
  tp = sum(1 for i, pred in enumerate(predicted) if compare(pred, actual[i]))
  return tp / total

In [None]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
# Load the 'totto' dataset; later cells index it by split name
# ("train", "validation", "test").
totto_dataset = load_dataset('totto')


In [None]:
class Header:
  """Column header that renders itself inside <col_header> tags."""

  def __init__(self, name):
    self.name = name

  def __str__(self):
    # Space-separated: opening tag, header text, closing tag.
    return " ".join(("<col_header>", self.name, "</col_header>"))


class Cell:
  """Table cell value with an optional header, rendered inside <cell> tags."""

  def __init__(self, value, header):
    self.value = value
    self.header = header

  def __str__(self):
    # The header markup is emitted only when a header is attached.
    pieces = ["<cell>", self.value]
    if self.header is not None:
      pieces.append(str(self.header))
    pieces.append("</cell>")
    return " ".join(pieces)


class Table:
  """Table parsed from rows of cell dicts, pairing data cells with column headers.

  Each input cell is a dict read through the keys 'value', 'column_span' and
  'is_header'. `self.rows` holds one list of Cell objects per input row, with
  a cell repeated once per spanned column.

  NOTE(review): because cells are repeated per `column_span`, the (row, col)
  pairs given to `cellsToText` index the expanded rows — confirm this matches
  how the highlighted-cell coordinates are defined upstream.
  """

  def __init__(self):
    self.rows = []

  def initTable(self, data):
    """Parse `data` (an iterable of rows of cell dicts) and append to `self.rows`.

    A row containing at least one header cell becomes the current header and
    is remembered; its own cells are stored without a header. Any other row is
    matched against the current header, then against the remembered ones.
    """
    seenHeaders = []
    latestHeader = []
    for rawRow in data:
      if self.__containsHeader(rawRow):
        latestHeader = self.__readHeader(rawRow)
        seenHeaders.append(latestHeader)
        parsedRow = self.__readRow(rawRow)
      else:
        parsedRow = self.__readRow(rawRow, latestHeader, seenHeaders)
      self.rows.append(parsedRow)

  def __readHeader(self, row):
    """Return one Header per spanned column of a header row."""
    expanded = []
    for cell in row:
      span = cell['column_span']
      label = cell['value']
      expanded.extend(Header(label) for _ in range(span))
    return expanded

  def __readRow(self, row, currentHeader = None, headersList = None):
    """Turn a raw row into Cells, attaching the matched header when there is one."""
    matched = self.__match(row, currentHeader, headersList)
    hasHeader = matched is not None and len(matched) > 0
    cells = []
    # The header is looked up by the cell's position in the raw row, not by
    # its expanded column position.
    for position, cell in enumerate(row):
      span = cell['column_span']
      value = cell['value']
      for _ in range(span):
        header = matched[position] if hasHeader else None
        cells.append(Cell(value, header))
    return cells

  def __match(self, row, currentHeader, headersList):
    """Pick a header with as many entries as `row` has cells, or None.

    The current header is preferred; otherwise the first remembered header of
    matching length is used.
    """
    if currentHeader is None and headersList is None:
      return None
    if currentHeader is not None and len(currentHeader) == len(row):
      return currentHeader
    candidates = headersList or []
    return next((h for h in candidates if len(h) == len(row)), None)

  def __containsHeader(self, cellData):
    """Return True if any cell of the row has a truthy 'is_header'."""
    return any(bool(cell['is_header']) for cell in cellData)

  def cellsToText(self, cellsList):
    """Render the cells at the given (row, col) positions inside <table> tags."""
    rendered = "".join(
        str(self.rows[position[0]][position[1]]) + " " for position in cellsList)
    return "<table> " + rendered + "</table>"

  def __str__(self):
    # One line per row, cells separated by single spaces.
    return "".join(
        " ".join(str(cell) for cell in row) + "\n" for row in self.rows)


In [None]:
class TottoTable:
  """Flat table: a list of column header strings plus a list of data rows.

  Cells are dicts read through the keys 'value', 'column_span' and
  'is_header'. A cell spanning several columns is repeated once per column.
  """

  def __init__(self):
    self.headers = []
    self.rows = []

  def initTable(self, data):
    """Fill `headers` and `rows` from `data` (an iterable of rows of cell dicts).

    Header cells extend `self.headers`; the remaining cells of each input row
    form one entry of `self.rows`. Rows with no data cells add no row.

    Fixes over the previous version: data values are no longer appended to
    `self.headers`, and each input row yields a single row instead of one
    row per cell.
    """
    for rowData in data:
      row = []
      for cellData in rowData:
        target = self.headers if bool(cellData['is_header']) else row
        target.extend([cellData['value']] * cellData['column_span'])
      if row:
        self.rows.append(row)

  def setHeaders(self, headersList):
    """Append each header value once per spanned column.

    The previous `range(1, span)` dropped every header with a span of 1.
    """
    for headerCell in headersList:
      self.headers.extend([headerCell['value']] * headerCell['column_span'])

  def addRow(self, rowList):
    """Append one data row built from the cells' 'value' entries."""
    self.rows.append([rowCell['value'] for rowCell in rowList])

  def getHeader(self, pos):
    return self.headers[pos]

  def getCellValue(self, row, column):
    return self.rows[row][column]

  def __str__(self):
    # Tab-separated header line followed by one tab-separated line per row.
    lines = ["\t".join(self.headers)]
    lines.extend("\t".join(row) for row in self.rows)
    return "".join(line + "\n" for line in lines)

  def cellsToText(self, cellsList):
    """Render the cells at the given (row, col) positions with their headers.

    The row index is shifted by one before lookup.
    NOTE(review): presumably this assumes a single header row at table row 0;
    a row index of 0 would wrap around to the last row — confirm with callers.
    """
    text = ""
    for cell in cellsList:
      headerValue = "<col_header> " + self.headers[cell[1]] + " </col_header>"
      cellValue = "<cell> " + self.getCellValue(cell[0] - 1, cell[1]) + " " + headerValue + " </cell>"
      text += cellValue + " "
    return "<table> " + text + "</table>"


In [None]:
def dataset_to_tsv(dataset, out_fname):
  """Write `dataset` to `out_fname` as "<input>\\t<target>" lines.

  The input is the page title, section title, the stringified table rows and
  the stringified highlighted cells, each wrapped in its own tag pair. The
  target is the first 'final_sentence' of the example's sentence annotations.

  Args:
    dataset: iterable of examples read through the keys 'table',
      'highlighted_cells', 'table_page_title', 'table_section_title' and
      'sentence_annotations'.
    out_fname: output path, opened with tf.io.gfile.GFile.

  Returns:
    The number of examples written.
  """
  count = 0
  with tf.io.gfile.GFile(out_fname, "w") as outfile:
    for example in dataset:
      table = example['table']
      table_cells_text = " ".join(map(str, table))
      highlighted_cells_text = " ".join(
          str(table[cell[0]][cell[1]]) for cell in example['highlighted_cells'])
      # The page-title closing tag used to be written as "/<page_title>".
      text_to_append = ["<page_title>", example['table_page_title'], "</page_title>", "<section_title>", example['table_section_title'], "</section_title>", "<table>", table_cells_text, "</table>", "<highlighted_cells>", highlighted_cells_text, "</highlighted_cells>"]
      x = " ".join(text_to_append)
      y = example['sentence_annotations']['final_sentence'][0]
      outfile.write("%s\t%s\n" % (x, y))
      count+=1
      tf.logging.log_every_n(tf.logging.INFO, "Wrote %d examples to %s." % (count, out_fname), 1000)
  return count

In [None]:
def dataset_to_tsv_2(dataset, out_fname):
  """Write `dataset` to `out_fname` as "<input>\\t<target>" lines; return the count.

  The input holds the tagged page title, the tagged section title and the
  highlighted cells rendered by `Table.cellsToText`. The target is the first
  'final_sentence' of the example.

  NOTE(review): the `[0]` lookup assumes every example has at least one final
  sentence — verify this holds for every split passed in.
  """
  written = 0
  with tf.io.gfile.GFile(out_fname, "w") as tsv_file:
    for example in dataset:
      parsed_table = Table()
      parsed_table.initTable(example['table'])
      cells_markup = parsed_table.cellsToText(example['highlighted_cells'])
      source_text = " ".join([
          "<page_title>", example['table_page_title'], "</page_title>",
          "<section_title>", example['table_section_title'], "</section_title>",
          cells_markup,
      ])
      target_text = example['sentence_annotations']['final_sentence'][0]
      tsv_file.write("%s\t%s\n" % (source_text, target_text))
      written += 1
      tf.logging.log_every_n(
          tf.logging.INFO,
          "Wrote %d examples to %s." % (written, out_fname),
          1000)
  return written

In [None]:
## DATA FOR TOTTO FINE TUNING - Train with 25000 Steps
import json
# counts.json caches the per-split example counts; its presence means the
# TSVs were already generated.
counts_path = os.path.join(DATA_DIR, "counts.json")
tsv_path = {
    "train": os.path.join(DATA_DIR, "train.tsv"),
    "validation": os.path.join(DATA_DIR, "validation.tsv"),
    "test": os.path.join(DATA_DIR, "test.tsv")
}

if tf.io.gfile.exists(counts_path):
  # Use cached data and counts.
  tf.logging.info("Loading ToTTo from cache.")
  # Open through a context manager so the handle is always closed.
  with tf.io.gfile.GFile(counts_path) as counts_file:
    num_nq_examples = json.load(counts_file)
else:
  # Create TSVs and get counts.
  tf.logging.info("Generating ToTTo TSVs.")
  num_nq_examples = {}
  for split in ["train", "validation", "test"]:
    num_nq_examples[split] = dataset_to_tsv_2(totto_dataset[split], tsv_path[split])
  # Closing the file explicitly ensures the counts are flushed to storage.
  with tf.io.gfile.GFile(counts_path, "w") as counts_file:
    json.dump(num_nq_examples, counts_file)

In [None]:
## DATA FOR AMBTOTTO FINE TUNING - To Execute after the first training till 30000 steps
import json
print(DATA_DIR)
#counts_path = os.path.join(DATA_DIR, "counts.json")
# Rebinds tsv_path, which to_dataset_ts reads at call time. Only the "train"
# and "validation" splits are defined here.
tsv_path = {
    "train": os.path.join(DATA_DIR, "totto_train.tsv"),
    "validation": os.path.join(DATA_DIR, "totto_ambiguities.tsv"),
}

In [None]:
def to_dataset_ts(split, shuffle_files=False):
  """Return a tf.data.Dataset of {"totto": ..., "explain_table": ...} dicts.

  Args:
    split: key into the global `tsv_path` dict selecting the TSV file.
    shuffle_files: accepted and ignored, since each split is a single file.
  """
  del shuffle_files

  # Each line is "<input>\t<target>"; parse it into two string fields.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)

  def to_feature_dict(*fields):
    return dict(zip(["totto", "explain_table"], fields))

  lines = tf.data.TextLineDataset(tsv_path[split])
  pairs = lines.map(
      parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return pairs.map(to_feature_dict)

# Preview the first five parsed examples. The "train" split is the one read,
# so the message now says so (it used to say "validation").
print("A few raw training examples...")
for ex in tfds.as_numpy(to_dataset_ts("train").take(5)):
  print(ex)

In [None]:
def text_preprocessor(ds):
  """Map {"totto", "explain_table"} examples to T5 {"inputs", "targets"} examples.

  Both fields are lowercased, and the inputs are prefixed with "totto table: ".
  """
  def normalize_text(text):
    """Lowercase a TensorFlow string (quotes are left untouched)."""
    return tf.strings.lower(text)

  def to_inputs_and_targets(ex):
    """Map {"totto": ..., "explain_table": ...}->{"inputs": ..., "targets": ...}."""
    prefixed_table = tf.strings.join(
        ["totto table: ", normalize_text(ex["totto"])])
    return {
        "inputs": prefixed_table,
        "targets": normalize_text(ex["explain_table"]),
    }

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Drop any earlier registration first so this cell can be re-run.
t5.data.TaskRegistry.remove("totto_context")
t5.data.TaskRegistry.add(
    "totto_context",
    # Specify the task type.
    t5.data.Task,
    # Supply a function which returns a tf.data.Dataset.
    dataset_fn=to_dataset_ts,
    # Must be keys of the tsv_path dict read by to_dataset_ts.
    splits=["train", "validation"],
    # Supply a function which preprocesses text from the tf.data.Dataset.
    text_preprocessor=[text_preprocessor],
    # Lowercase targets before computing metrics.
    postprocess_fn=t5.data.postprocessors.lower_text, 
    # We'll use accuracy as our evaluation metric.
    metric_fns=[t5.evaluation.metrics.accuracy],
    # Not required, but helps for mixing and auto-caching.
    #num_input_examples=num_nq_examples
)

In [None]:
# Fetch the registered task and preview five preprocessed validation examples.
nq_task = t5.data.TaskRegistry.get("totto_context")
#ds = nq_task.get_dataset(split="validation", sequence_length={"inputs": 128, "targets": 32})
# The preview uses sequence lengths of 512 for both inputs and targets.
ds = nq_task.get_dataset(split="validation", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed validation examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

In [None]:
# Remove earlier registrations (including the "totto_all" name) before
# re-registering, so this cell can be re-run.
t5.data.MixtureRegistry.remove("totto_all")
t5.data.MixtureRegistry.remove("all_mix")
# "all_mix" contains the single task "totto_context" at rate 1.0.
t5.data.MixtureRegistry.add(
    "all_mix",
    ["totto_context"],
     default_rate=1.0
)

In [None]:
# Choose the T5 size, derive checkpoint locations, and build the MtfModel.
MODEL_SIZE = "3B" #@param["small", "base", "large", "3B", "11B"]
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)
# Fine-tuned checkpoints are written under MODELS_DIR/<size>.
MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

if ON_CLOUD and MODEL_SIZE == "3B":
  tf.logging.warning(
      "The `3B` model is too large to use with the 5GB GCS free tier. "
      "Make sure you have at least 25GB on GCS before continuing."
  )
elif ON_CLOUD and MODEL_SIZE == "11B":
  raise ValueError(
      "The `11B` parameter is too large to fine-tune on the `v2-8` TPU "
      "provided by Colab. Please comment out this Error if you're running "
      "on a larger TPU."
  )

# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
# Each tuple is (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.

# TPU_ADDRESS and TPU_TOPOLOGY are only defined when ON_CLOUD is True.
# NOTE(review): the model uses sequence lengths 128/32 while the dataset
# preview cell uses 512/512 — confirm the shorter lengths are intended.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 128, "targets": 32},
    learning_rate_schedule=0.003,
    save_checkpoints_steps=100,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

In [None]:
if ON_CLOUD:
  import datetime
  %reload_ext tensorboard
  # Delete the local ./logs/ directory before creating a new log dir.
  !rm -rf ./logs/
  # Timestamped log directory for this run.
  log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  # NOTE(review): this callback is not referenced by any other visible cell.
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

#%tensorboard --logdir="$MODEL_DIR" --port=0

In [None]:
FINETUNE_STEPS =  30000 #@param {type: "integer"}
# Override the checkpointing settings the model was constructed with.
model._save_checkpoints_steps=1000
model._keep_checkpoint_max = 50

# Fine-tune on the "all_mix" mixture, starting from the public pretrained
# checkpoint directory.
model.finetune(
    mixture_or_task_name="all_mix",
    pretrained_model_dir=PRETRAINED_DIR,
    #pretrained_model_dir=MODEL_DIR,
    finetune_steps=FINETUNE_STEPS,
    #pretrained_checkpoint_step = 12000
)

MSG = str(FINETUNE_STEPS) + " done"
# `pb` (presumably a Pushbullet-style client) is not created in any visible
# cell, so an unconditional call raised NameError once training finished.
# Send the note only when a notifier exists; otherwise print the message.
if "pb" in globals():
  push = pb.push_note("TOTTO", MSG)
else:
  print(MSG)

In [None]:
# Cell output: the most recent checkpoint found in the model directory.
tf.train.latest_checkpoint(
    model._model_dir, latest_filename=None
)

In [None]:
# Show the batch size currently set on the model next to the training one.
print(f"Model Batch Size:  {model.batch_size}")
print(f"Train Batch Size:  {train_batch_size}")

In [None]:
# Batch size for evaluation. The 4x multiplier is disabled, so it currently
# equals the training batch size; the eval call itself is commented out too.
#model.batch_size = train_batch_size * 4
model.batch_size = train_batch_size * 1
#model.eval(
#    mixture_or_task_name="all_mix",
#    checkpoint_steps="all"
#)

In [None]:
!pip install fsspec
!pip install gcsfs
import pandas as pd
from tensorflow.python.lib.io import file_io
with file_io.FileIO('gs://pythia_totto/data/totto_ambiguities.tsv', 'r') as f:
  df = pd.read_csv(f, sep='\t', header=None)
print(df.shape)

In [None]:
# Column 0 holds the model inputs, column 1 the reference sentences.
x_val = df[0].values
y_val = df[1].values

In [None]:
now = time.time()
# Write the validation inputs to a text file, one example per line. The
# timestamp keeps the file names of separate runs apart.
predict_inputs_path = os.path.join(MODEL_DIR, "predict_inputs_%d.txt" % now)
predict_outputs_path = os.path.join(MODEL_DIR, "predict_outputs_%d.txt" % now)
print(predict_inputs_path)
print(predict_outputs_path)
# Manually apply the same preprocessing as text_preprocessor: lowercase the
# text and prepend "totto table: ".
with tf.io.gfile.GFile(predict_inputs_path, "w") as f:
  for q in x_val:
    f.write("totto table: %s\n" % q.lower())

# Ignore any logging so that we only see the model's predictions.
with tf_verbosity_level('ERROR'):
  model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
  model.predict(
      input_file=predict_inputs_path,
      output_file=predict_outputs_path,
      # Select the most probable output token at each step.
      temperature=0,
  )

# The output filename will have the checkpoint appended so we glob to get 
# the latest.
prediction_files = sorted(tf.io.gfile.glob(predict_outputs_path + "*"))
print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])
predicted = []
with tf.io.gfile.GFile(prediction_files[-1]) as f:
  for x, y, p in zip(x_val, y_val, f):
    if x:
      # Lines iterated from the file keep their trailing newline; strip it so
      # the stored predictions are clean text for printing and scoring.
      predicted.append(p.rstrip("\n"))
      #print("Input     : " + x)
      #print("Prediction: " + p)
      #print("Actual    : " + y)
      #print()

In [None]:
# Sanity check: both counts should match before scoring.
for sequence in (predicted, y_val):
  print(len(sequence))

In [None]:
# Print each prediction next to its reference sentence.
for pred, actual in zip(predicted, y_val):
  print(pred, actual)

In [None]:
def toList(lst):
    """Wrap every element of `lst` in its own single-item list."""
    return [[item] for item in lst]

def toLower(lst):
    """Return a new list with every string of `lst` lowercased."""
    return [text.lower() for text in lst]

def toTokens(lst):
    """Split every string of `lst` on whitespace, returning a list of token lists."""
    return [str.split(text) for text in lst]

# Quick demonstration of the helper functions on a small sample.
l = ['Uno Due Tre', 'Uno Due', 'UNO']
print(toList(l))
print(toLower(l))
print(toTokens(toLower(l)))
# Shape used by the commented-out BLEU call: one reference token list per prediction.
print(toList(toTokens(toLower(l))))

In [None]:
from datasets import load_metric
# Score the predictions against the references with SacreBLEU and ROUGE.
sacrebleu_metric = load_metric('sacrebleu')
#print(sacrebleu_metric)
rouge_metric = load_metric('rouge')
#print(rouge_metric)
# Loaded but only used by the commented-out BLEU computation below.
bleu_metric = load_metric('bleu')
# toList gives each prediction a list holding its single reference.
results_sacrebleu = sacrebleu_metric.compute(predictions=predicted, references=toList(y_val), lowercase=True)
#print(results_sacrebleu)
results_rouge = rouge_metric.compute(predictions=toLower(predicted), references=toLower(y_val))
#print(results_rouge)
#results_bleu = bleu_metric.compute(predictions=toTokens(toLower(predicted)), references=toList((toTokens(toLower(y_val)))))
#print(results_bleu)

print("BLEU score: ", results_sacrebleu['score'])
# NOTE(review): `.low` is presumably the lower bound of the aggregated ROUGE
# interval; `.mid` may be the intended headline value — confirm.
print("ROUGE score: ", results_rouge['rouge1'].low.fmeasure)