We need to convert our data into the SQA format and save it as a csv/tsv file for fine-tuning. The file needs the following columns:

id: optional, id of the table-question pair, for bookkeeping purposes.

annotator: optional, id of the person who annotated the table-question pair, for bookkeeping purposes.

position: integer indicating if the question is the first, second, third,… related to the table. Only required in case of conversational setup (SQA). You don’t need this column in case you’re going for WTQ/WikiSQL-supervised.

question: string

table_file: string, name of a csv file containing the tabular data
answer_coordinates: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is part of the answer)

answer_text: list of one or more strings (each string being a cell value that is part of the answer)
aggregation_label: index of the aggregation operator. Only required in case of strong supervision for aggregation (the WikiSQL-supervised case)

float_answer: the float answer to the question, if there is one (np.nan if there isn’t). Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL)

The tables referred to in the table_file column should be saved together in a folder.

In [1]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

In [2]:
# Fetch the DataBench "semeval" QA pairs, one dataset object per split (train first, then dev)
semeval_train_qa, semeval_dev_qa = (
    load_dataset("cardiffnlp/databench", name="semeval", split=split_name)
    for split_name in ("train", "dev")
)

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

In [3]:
def _dataset_number(dataset_name):
    """Return the part of a dataset name before the first '_' as an int (e.g. '001_Forbes' -> 1)."""
    prefix, _, _ = dataset_name.partition('_')
    return int(prefix)


# Unique train dataset names, ordered by their numeric prefix
dfs_train = sorted(set(semeval_train_qa['dataset']), key=_dataset_number)

# Unique dev dataset names, ordered the same way
dfs_dev = sorted(set(semeval_dev_qa['dataset']), key=_dataset_number)

In [4]:
##### Collect the QA dataframe of every train table and cache each table locally as CSV #####


qa_dict = {}  # dataset name -> QA dataframe read from that dataset's qa.parquet
output_folder = os.getcwd()
for table in dfs_train:
    print('Processing: ', table)
    remote_dir = f"hf://datasets/cardiffnlp/databench/data/{table}"
    csv_file_path = os.path.join(output_folder, f"{table}.csv")

    # The QA pairs are always (re)loaded, even when the table CSV is already cached
    qa = pd.read_parquet(f"{remote_dir}/qa.parquet")
    qa_dict[table] = qa

    if os.path.exists(csv_file_path):
        # The table is already on disk, so there is nothing to download
        print(f"CSV for ID {table} already exists. Skipping...")
    else:
        try:
            # sample.parquet is the lite version of the table (original note: only 20 rows)
            df = pd.read_parquet(f"{remote_dir}/sample.parquet")
            # TODO (original author's note): re-run this export for the real run
            df.to_csv(csv_file_path, index=False)
            print(f"Saved CSV for ID {table} at {csv_file_path}.")
        except Exception as e:
            # Best effort: report the failure and carry on with the next table
            print(f"Error processing ID {table}: {e}")

Processing:  001_Forbes
CSV for ID 001_Forbes already exists. Skipping...
Processing:  002_Titanic
CSV for ID 002_Titanic already exists. Skipping...
Processing:  003_Love
CSV for ID 003_Love already exists. Skipping...
Processing:  004_Taxi
CSV for ID 004_Taxi already exists. Skipping...
Processing:  005_NYC
CSV for ID 005_NYC already exists. Skipping...
Processing:  006_London
CSV for ID 006_London already exists. Skipping...
Processing:  007_Fifa
CSV for ID 007_Fifa already exists. Skipping...
Processing:  008_Tornados
CSV for ID 008_Tornados already exists. Skipping...
Processing:  009_Central
CSV for ID 009_Central already exists. Skipping...
Processing:  010_ECommerce
CSV for ID 010_ECommerce already exists. Skipping...
Processing:  011_SF
CSV for ID 011_SF already exists. Skipping...
Processing:  012_Heart
CSV for ID 012_Heart already exists. Skipping...
Processing:  013_Roller
CSV for ID 013_Roller already exists. Skipping...
Processing:  014_Airbnb
CSV for ID 014_Airbnb alread

In [6]:
# List the dataset names that have a QA dataframe stored in qa_dict.
# For each of them, the answer coordinates still need to be assigned manually to every QA row.
qa_dict.keys()

dict_keys(['001_Forbes', '002_Titanic', '003_Love', '004_Taxi', '005_NYC', '006_London', '007_Fifa', '008_Tornados', '009_Central', '010_ECommerce', '011_SF', '012_Heart', '013_Roller', '014_Airbnb', '015_Food', '016_Holiday', '017_Hacker', '018_Staff', '019_Aircraft', '020_Real', '021_Telco', '022_Airbnbs', '023_Climate', '024_Salary', '025_Data', '026_Predicting', '027_Supermarket', '028_Predict', '029_NYTimes', '030_Professionals', '031_Trustpilot', '032_Delicatessen', '033_Employee', '034_World', '035_Billboard', '036_US', '037_Ted', '038_Stroke', '039_Happy', '040_Speed', '041_Airline', '042_Predict', '043_Predict', '044_IMDb', '045_Predict', '046_120', '047_Bank', '048_Data', '049_Boris'])

In [7]:
# filter to only number and category answers for all qa dfs in qa_dict
def extract_float(answer):
    """Convert ``answer`` to a float, falling back to NaN.

    Returns ``float(answer)`` when that conversion succeeds; returns ``np.nan``
    when it raises ``ValueError`` or ``TypeError`` (e.g. non-numeric text or None).
    """
    try:
        parsed = float(answer)
    except (TypeError, ValueError):
        parsed = np.nan
    return parsed

# Reduce every QA dataframe to the 'number' and 'category' questions whose sample
# answer is not '0' / 'None', then add the columns used further down.
# The cell is written so that it can be re-run safely: the 'answer' column is only
# dropped when still present and '.csv' is only appended when it is missing.
for table_name in list(qa_dict):
    qa = qa_dict[table_name]

    wanted_type = qa['type'].isin(['number', 'category'])
    usable_answer = ~qa['sample_answer'].isin(['0', 'None'])

    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of the original dataframe (avoids SettingWithCopyWarning)
    qa = (
        qa.loc[wanted_type & usable_answer]
        .drop(columns='answer', errors='ignore')  # answer for the full (non-sample) table
        .copy()
    )

    # 'dataset' becomes the file name of the table CSV
    has_suffix = qa['dataset'].str.endswith('.csv', na=False)
    qa['dataset'] = qa['dataset'].where(has_suffix, qa['dataset'] + '.csv')

    # Numeric value of the sample answer, NaN when it is not a number
    qa['float_answer'] = qa['sample_answer'].apply(extract_float)
    qa_dict[table_name] = qa

In [8]:
# Manually remove further questions.
# NOTE: this cell is not idempotent - run it exactly once after the filtering cell,
# because re-running it changes (or breaks on) the already pruned dataframes.
first_key, second_key = dfs_train[0], dfs_train[1]

# First dataset: discard the last remaining question
qa_dict[first_key] = qa_dict[first_key].iloc[:-1]

# Second dataset: renumber the rows from 0, then discard the rows labelled 2, 3, 6 and 7
qa_dict[second_key] = qa_dict[second_key].reset_index(drop=True).drop([2, 3, 6, 7])

In [9]:
# Print every question with its sample answer (currently looking at 002_Titanic),
# then display the QA dataframe itself
qa_id = dfs_train[1]
qa = qa_dict[qa_id]
for question_text, sample_answer in zip(qa['question'], qa['sample_answer']):
    print(question_text)
    # the answer line is followed by the same blank lines as a separate print('\n')
    print('     --->', sample_answer, end='\n\n\n')

qa

How many unique passenger classes are present in the dataset?
     ---> 3


What's the maximum age of the passengers?
     ---> 69.0


Which passenger class has the highest number of survivors?
     ---> 3


What's the most common gender among the survivors?
     ---> female




Unnamed: 0,question,type,columns_used,column_types,sample_answer,dataset,float_answer
0,How many unique passenger classes are present ...,number,[Pclass],['number[uint8]'],3,002_Titanic.csv,3.0
1,What's the maximum age of the passengers?,number,[Age],['number[UInt8]'],69.0,002_Titanic.csv,69.0
4,Which passenger class has the highest number o...,category,"[Pclass, Survived]","['number[uint8]', 'boolean']",3,002_Titanic.csv,3.0
5,What's the most common gender among the surviv...,category,"[Sex, Survived]","['category', 'boolean']",female,002_Titanic.csv,


In [10]:
def _single_cell_answers(cells):
    """Wrap each (row, column) coordinate in its own one-element list, one list per question."""
    return [[cell] for cell in cells]


# Hand-assigned answer cell for each remaining 001_Forbes question (this one works)
qa_dict[dfs_train[0]]['answer_coords'] = _single_cell_answers(
    [(11, 5), (14, 6), (7, 8), (18, 10), (0, 4), (11, 9)]
)

# 002_Titanic is still being worked on
qa_dict[dfs_train[1]]['answer_coords'] = _single_cell_answers([
    (0, 4),   # this is a fudge
    (0, 0),
    (0, 4),
    (12, 2),
])

qa_dict[dfs_train[0]]

Unnamed: 0,question,type,columns_used,column_types,sample_answer,dataset,float_answer,answer_coords
5,What is the age of the youngest billionaire?,number,['age'],['number[UInt8]'],32.0,001_Forbes.csv,32.0,"[(11, 5)]"
9,What's the rank of the wealthiest non-self-mad...,number,"['selfMade', 'rank']","['boolean', 'number[uint16]']",288,001_Forbes.csv,288.0,"[(14, 6)]"
10,Which category does the richest billionaire be...,category,"['finalWorth', 'category']","['number[uint32]', 'category']",Food & Beverage,001_Forbes.csv,,"[(7, 8)]"
11,What's the country of origin of the oldest bil...,category,"['age', 'country']","['number[UInt8]', 'category']",United Kingdom,001_Forbes.csv,,"[(18, 10)]"
12,What's the gender of the billionaire with the ...,category,"['philanthropyScore', 'gender']","['number[UInt8]', 'category']",M,001_Forbes.csv,,"[(0, 4)]"
13,What's the source of wealth for the youngest b...,category,"['age', 'source']","['number[UInt8]', 'category']",fintech,001_Forbes.csv,,"[(11, 9)]"


In [11]:
# Export the working 001_Forbes QA dataframe (without its index) as a toy dataset
toy_qa = qa_dict[dfs_train[0]]
toy_qa.to_csv('toy_df.csv', index=False)

In [12]:
# Set up the WTQ-style config, tokenizer and model, all from the same fine-tuned checkpoint
TAPAS_CHECKPOINT = "google/tapas-base-finetuned-wtq"

# Load the checkpoint's configuration unchanged. `aggregation_labels` is documented as a
# mapping from aggregation index to operator name, not an on/off switch, so passing
# `aggregation_labels=True` would replace that mapping with a boolean.
# NOTE(review): the WTQ checkpoint is expected to ship its own aggregation settings -
# confirm by inspecting `config.aggregation_labels` after loading.
config = TapasConfig.from_pretrained(TAPAS_CHECKPOINT)

# Initialize the tokenizer and model with the configuration
tokenizer = TapasTokenizer.from_pretrained(TAPAS_CHECKPOINT)
model = TapasForQuestionAnswering.from_pretrained(TAPAS_CHECKPOINT, config=config)



In [15]:
df_num = 0 # 0 (001_Forbes works) and 1 (002_Titanic doesnt work)
dataset_name = dfs_train[df_num]
qa_selected = qa_dict[dataset_name]

# The tokenizer expects every table cell to be text, hence astype(str)
table = pd.read_csv(f'{dataset_name}.csv').astype(str)
queries = qa_selected['question'].tolist()
answer_coordinates = qa_selected['answer_coords'].tolist()
# For a batch of queries the tokenizer takes one list of answer strings per query
# (parallel to the per-query lists of coordinates), so each answer gets its own list
answer_text = [[str(answer)] for answer in qa_selected['sample_answer']]

In [16]:
# Encode the table together with all queries and their answer labels in a single call
tokenizer_kwargs = {
    "table": table,
    "queries": queries,
    "answer_coordinates": answer_coordinates,
    "answer_text": answer_text,
    "padding": "max_length",
    "truncation": True,
    "return_tensors": "pt",
}
inputs = tokenizer(**tokenizer_kwargs)

print(inputs.keys())

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


dict_keys(['input_ids', 'labels', 'numeric_values', 'numeric_values_scale', 'token_type_ids', 'attention_mask'])
