# Table data extraction with Tapas

Tapas was introduced in this research paper: https://arxiv.org/abs/2004.02349

You can either clone the repository (https://github.com/google-research/tapas) and start working, or follow this notebook.
The Tapas GitHub repository already has a notebook that details the implementation on Colab: https://colab.research.google.com/github/google-research/tapas/blob/master/notebooks/sqa_predictions.ipynb#scrollTo=uI6zyIM20Kw4

Note: This notebook is for running the model locally with a custom dataframe of interest. It's the same implementation detailed in the Colab notebook.

### First, let's install the code from PyPI https://pypi.org/project/tapas-table-parsing/

In [None]:
pip install tapas-table-parsing

### Let's import the packages

In [None]:
import tensorflow.compat.v1 as tf # make sure tensorflow version > 2.0, tensorflow~=2.2.0 will be ideal
import os 
import shutil
import csv
import pandas as pd
import IPython

tf.get_logger().setLevel('ERROR')

from tapas.utils import tf_example_utils
from tapas.protos import interaction_pb2
from tapas.utils import number_annotation_utils
from tapas.scripts import prediction_utils

Before making the custom directories to store the files, make sure to change into the local directory of your interest:

os.chdir(r'XXX\XXX')

I have downloaded and updated all the files required to run this code in my repo, so download or clone my repo to your local machine.

### Load checkpoint for prediction

Please note this is the base-sized model trained on SQA.

In [None]:
# Create the working directories, then stage the downloaded SQA checkpoint
# under the name recorded in the 'checkpoint' index file.
for directory in ('results/sqa/tf_examples', 'results/sqa/model'):
  os.makedirs(directory, exist_ok=True)

with open('results/sqa/model/checkpoint', 'w') as checkpoint_file:
  checkpoint_file.write('model_checkpoint_path: "model.ckpt-0"')

for suffix in ('.data-00000-of-00001', '.index', '.meta'):
  source = f'tapas_sqa_base/model.ckpt{suffix}'
  target = f'results/sqa/model/model.ckpt-0{suffix}'
  shutil.copyfile(source, target)

In [None]:
# Settings shared by every conversion; `converter` is the object the example
# conversion function calls for each question.
vocab_file = "tapas_sqa_base/vocab.txt"
max_seq_length = 512

config = tf_example_utils.ClassifierConversionConfig(
    vocab_file=vocab_file,
    max_seq_length=max_seq_length,
    # Row and column id limits are set to the same value as the sequence length.
    max_row_id=max_seq_length,
    max_column_id=max_seq_length,
    add_aggregation_candidates=False,
    strip_column_names=False,
)

converter = tf_example_utils.ToClassifierTensorflowExample(config)

### Please note I have removed some unwanted lines and altered the predict function to read data from a pandas DataFrame and predict the results

In [None]:
def convert_interactions_to_examples(tables_and_queries):
  """Calls Tapas converter to convert each interaction question to an example.

  Args:
    tables_and_queries: Iterable of `(table, queries)` pairs. `table` is a
      list of rows of text cells whose first row holds the column headers;
      `queries` is a sequence of question strings.

  Yields:
    The value returned by `converter.convert` for every question that
    converts. A question whose conversion raises ValueError is reported on
    stdout and skipped.
  """
  for idx, (table, queries) in enumerate(tables_and_queries):
    interaction = interaction_pb2.Interaction()
    for position, query in enumerate(queries):
      question = interaction.questions.add()
      question.original_text = query
      question.id = f"{idx}-0_{position}"
    # The first row names the columns; every remaining row is table content.
    for header in table[0]:
      interaction.table.columns.add().text = header
    for line in table[1:]:
      row = interaction.table.rows.add()
      for cell in line:
        row.cells.add().text = cell
    number_annotation_utils.add_numeric_values(interaction)
    for i, question in enumerate(interaction.questions):
      try:
        yield converter.convert(interaction, i)
      except ValueError as e:
        # interaction.id is never assigned in this function, so report the
        # id of the question that failed instead of an empty string.
        print(f"Can't convert question: {question.id} error: {e}")
        
def write_tf_example(filename, examples):
  """Serializes each example and writes it as one record of a TFRecord file.

  Args:
    filename: Path of the TFRecord file to write.
    examples: Iterable of objects exposing `SerializeToString()`.
  """
  with tf.io.TFRecordWriter(filename) as record_writer:
    for item in examples:
      serialized = item.SerializeToString()
      record_writer.write(serialized)

def predict(table_data, queries):
  """Writes TF examples for a table and its queries, then prints stored answers.

  The DataFrame is turned into a list of rows (header row first, every value
  as text), converted to examples and written to
  `results/sqa/tf_examples/test.tfrecord`. The predictions found in
  `results/sqa/model/test_sequence.tsv` are then read back and printed next
  to the query they belong to.

  NOTE(review): nothing in this function runs the model, so the results file
  has to be produced by a separate prediction step -- confirm.

  Args:
    table_data: pandas DataFrame holding the table to query.
    queries: Sequence of question strings; each results row selects its
      question through its `position` column.

  Returns:
    A list with the parsed answer coordinates of every results row that was
    read. The list is empty or partial when the results could not be read or
    parsed; the reason is printed instead of being raised.
  """
  table_data = table_data.astype(str)

  # astype(str) only converts the cells, so the column labels are converted
  # explicitly; labels read from a spreadsheet are not guaranteed to be text.
  table1 = [[str(label) for label in table_data.columns]]
  table1.extend(table_data.to_numpy().tolist())

  examples = convert_interactions_to_examples([(table1, queries)])
  write_tf_example("results/sqa/tf_examples/test.tfrecord", examples)
  # Empty companion file; presumably expected by the prediction step -- TODO confirm.
  write_tf_example("results/sqa/tf_examples/random-split-1-dev.tfrecord", [])

  results_path = "results/sqa/model/test_sequence.tsv"
  all_coordinates = []

  try:
    with open(results_path) as csvfile:
      reader = csv.DictReader(csvfile, delimiter='\t')
      for record in reader:
        coordinates = prediction_utils.parse_coordinates(record["answer_coordinates"])
        all_coordinates.append(coordinates)
        # +1 skips the header row stored at table1[0].
        answers = ', '.join(table1[r + 1][c] for r, c in coordinates)
        position = int(record['position'])
        print(">", queries[position])
        print(answers)
  except Exception as err:
    # Stay best-effort (keep whatever was parsed so far), but say what went
    # wrong and let KeyboardInterrupt / SystemExit propagate.
    print(f"Could not read predictions from {results_path}: {err}")
  return all_coordinates


### Now it's time to load your data and ask questions. Please note the sequence length of each row entry

In [None]:
# Load the table to query from the Excel workbook.
df = pd.read_excel('test_df.xlsx')

# Questions to ask about the loaded table.
questions = [
    "what were the drivers names?",
    "of these, which points did patrick carpentier and bruno junqueira score?",
    "how many points for Paul Tracy",
]
result = predict(df, questions)
