In [1]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

In [2]:
# Load in all qa (train and dev)
semeval_train_qa = load_dataset("cardiffnlp/databench", name="semeval", split="train")
semeval_dev_qa = load_dataset("cardiffnlp/databench", name="semeval", split="dev")

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

In [3]:
# get the names of all of the train datasets
dfs_train = list(set(semeval_train_qa['dataset']))
dfs_train = sorted(dfs_train, key=lambda x: int(x.split('_')[0]))

# get the names of all of the dev datasets
dfs_dev = list(set(semeval_dev_qa['dataset']))
dfs_dev = sorted(dfs_dev, key=lambda x: int(x.split('_')[0]))

In [4]:
qa_dict = {} # dict to store all qa 
output_folder = os.getcwd()
for table in dfs_train:
    print('Processing: ', table)
    csv_file_path = os.path.join(output_folder, f"{table}.csv")
    
    # Load the qa.parquet dataframe and store it in the dictionary
    qa = pd.read_parquet(f"hf://datasets/cardiffnlp/databench/data/{table}/qa.parquet")
    qa_dict[table] = qa
        
    # Skip if the CSV file already exists
    if os.path.exists(csv_file_path):
        print(f"CSV for ID {table} already exists. Skipping...")
        continue

    try:
        # Load the all.parquet dataframe and save it as CSV
        df = pd.read_parquet(f"hf://datasets/cardiffnlp/databench/data/{table}/sample.parquet") # loading in the lite versions with only 20 rows
        df.to_csv(csv_file_path, index=False)  #### RERUN THIS WHEN I DO THE REAL THING
        print(f"Saved CSV for ID {table} at {csv_file_path}.")

        

    except Exception as e:
        print(f"Error processing ID {table}: {e}")

Processing:  001_Forbes
CSV for ID 001_Forbes already exists. Skipping...
Processing:  002_Titanic
CSV for ID 002_Titanic already exists. Skipping...
Processing:  003_Love
CSV for ID 003_Love already exists. Skipping...
Processing:  004_Taxi
CSV for ID 004_Taxi already exists. Skipping...
Processing:  005_NYC
CSV for ID 005_NYC already exists. Skipping...
Processing:  006_London
CSV for ID 006_London already exists. Skipping...
Processing:  007_Fifa
CSV for ID 007_Fifa already exists. Skipping...
Processing:  008_Tornados
CSV for ID 008_Tornados already exists. Skipping...
Processing:  009_Central
CSV for ID 009_Central already exists. Skipping...
Processing:  010_ECommerce
CSV for ID 010_ECommerce already exists. Skipping...
Processing:  011_SF
CSV for ID 011_SF already exists. Skipping...
Processing:  012_Heart
CSV for ID 012_Heart already exists. Skipping...
Processing:  013_Roller
CSV for ID 013_Roller already exists. Skipping...
Processing:  014_Airbnb
CSV for ID 014_Airbnb alread

In [10]:
# filter to only number and category answers for all qa dfs in qa_dict
def extract_float(answer):
    try:
        return float(answer)
    except (ValueError, TypeError):
        return np.nan

for df in qa_dict:
    qa = qa_dict[df] 
    qa = qa[qa['type'].isin(['number', 'category'])]
    qa = qa.drop('answer', axis = 1)
    qa = qa.loc[~qa['sample_answer'].isin(['0', 'None'])]
    qa['dataset'] = qa['dataset'] + '.csv'
    qa['float_answer'] = qa['sample_answer'].apply(extract_float)
    qa_dict[df] = qa

KeyError: "['answer'] not found in axis"

In [None]:
# other questions to remove, make sure you only run this once
qa_dict[dfs_train[0]] = qa_dict[dfs_train[0]].iloc[:-1]
qa_dict[dfs_train[1]] = qa_dict[dfs_train[1]].reset_index(drop=True)
qa_dict[dfs_train[1]] = qa_dict[dfs_train[1]].drop([2,3,6,7])
qa_dict[dfs_train[6]] = qa_dict[dfs_train[6]].reset_index(drop=True).drop([0])
qa_dict[dfs_train[12]] = qa_dict[dfs_train[12]].reset_index(drop=True).drop([3])
qa_dict[dfs_train[14]] = qa_dict[dfs_train[14]].reset_index(drop=True).drop([0,4,5,7])
qa_dict[dfs_train[15]] = qa_dict[dfs_train[15]].reset_index(drop=True).drop([0,4,5,6,7])
qa_dict[dfs_train[17]] = qa_dict[dfs_train[17]].reset_index(drop=True).drop([6])
qa_dict[dfs_train[19]] = qa_dict[dfs_train[19]].reset_index(drop=True).drop([1])
qa_dict[dfs_train[20]] = qa_dict[dfs_train[20]].reset_index(drop=True).drop([2])

In [None]:
# 019_Aircraft (changed)
qa_dict[dfs_train[18]]['answer_coords'] = [
    [(0,4),(1,4),(2,4),(3,4),(4,4),(5,4),(6,4),(7,4),(8,4),(9,4),(10,4),(11,4),(12,4),(13,4),(14,4),(15,4),(16,4),(17,4),(18,4),(19,4)],
    [(16,3)], 
    [(0,8),(1,8),(9,8),(10,8),(11,8),(12,8),(17,8),(18,8),(19,8)],
    [(0,4)],
    [(9,11)], 
    [(3,6),(5,6),(7,6),(16,6),(15,6),(14,6)], #(3,6)
    [(16,0)]
]

In [None]:
# 018_Staff (changed)
qa_dict[dfs_train[17]]['answer_coords'] = [
    [(0,1),(1,1),(2,1),(3,1),(5,1),(9,1),(11,1),(12,1),(18,1)], 
    [(9,6)], 
    [(0,8),(1,8),(2,8),(3,8),(4,8),(5,8),(6,8),(7,8),(8,8),(9,8),(10,8),(11,8),(12,8),(13,8),(14,8),(15,8),(16,8),(17,8),(18,8),(19,8)],
    [(0,1)],
    [(1,7),(2,7),(3,7),(4,7),(5,7),(8,7),(9,7),(11,7),(12,7),(13,7),(14,7),(17,7),(18,7)], #(1,7)
    [(1,10)],
]

In [None]:
# 017_Hacker (changed)
qa_dict[dfs_train[16]]['answer_coords'] = [
    [(3,3)], 
    [(12,2)], 
    [(0,8),(1,8),(2,8),(3,8),(4,8),(5,8),(6,8),(7,8),(8,8),(9,8),(10,8),(11,8),(12,8),(13,8),(14,8),(15,8),(16,8),(17,8),(18,8),(19,8)],
    [(0,9),(2,9),(4,9),(11,9),(14,9),(15,9),(16,9),(18,9)],
    [(1,6),(2,6),(9,6),(10,6),(13,6),(15,6)], # (1,6)
    [(0,5),(1,5),(2,5),(3,5),(5,5),(6,5),(7,5),(8,5),(9,5),(10,5),(11,5),(13,5),(14,5),(16,5),(17,5),(18,5),(19,5)], # (0,5)
    [(12,9)],
    [(0,3),(2,3),(4,3),(5,3),(8,3),(9,3),(10,3),(11,3),(12,3),(13,3),(14,3),(15,3),(16,3)] #(0,3)
]

In [None]:
# 016_Holiday (no changes needed)
qa_dict[dfs_train[15]]['answer_coords'] = [
    [(0,6),(4,6),(5,6)], 
    [(0,18),(4,18),(5,18),(10,18)], 
    [(0,12),(3,12),(9,12),(10,12)]
]

In [None]:
# 015_Food (no changes needed)
qa_dict[dfs_train[14]]['answer_coords'] = [
    [(0,2),(1,2),(2,2),(3,2),(7,2),(8,2),(9,2),(18,2)], 
    [(0,3),(1,3),(2,3),(3,3),(5,3),(6,3),(7,3),(8,3),(9,3),(12,3),(15,3),(17,3),(18,3),(19,3)],   
    [(1,0),(2,0),(3,0),(4,0),(5,0),(6,0),(7,0),(8,0),(9,0),(10,0),(11,0),(12,0),(13,0),(14,0),(15,0),(16,0),(17,0),(18,0),(19,0),(0,0)], 
    [(1,1)],
]

In [None]:
# 013_Roller (no changes needed)
qa_dict[dfs_train[12]]['answer_coords'] = [
    [(12,8)], 
    [(3,5),(4,5),(9,5),(10,5),(11,5)], 
    [(12,3)],   
    [(1,6)], 
    [(15,10)]
]

In [None]:
# 012_Heart (changed)
qa_dict[dfs_train[11]]['answer_coords'] = [
    [(7,1)], 
    [(17,2)],
    [(0,0),(1,0),(4,0),(5,0),(7,0),(8,0),(10,0),(11,0),(12,0),(13,0),(14,0),(16,0),(18,0),(19,0)], 
    [(0,7),(1,7),(2,7),(3,7),(4,7),(5,7),(6,7),(7,7),(8,7),(9,7),(10,7),(11,7),(12,7),(13,7),(14,7),(15,7),(16,7),(17,7),(18,7),(19,7)],  
    [(2,5),(3,5),(5,5),(6,5),(9,5),(12,5),(14,5),(15,5),(17,5),(18,5)], #(2,5)
    [(2,6),(6,6)], #(2,6)
    [(1,4),(2,4),(3,4),(5,4),(6,4),(8,4),(10,4),(12,4),(14,4),(18,4),(19,4)], #(1,4)
    [(8,6),(10,6)] #(8,6)
]

In [None]:
# 011_SF (changes made)
qa_dict[dfs_train[10]]['answer_coords'] = [
    [(0,1),(2,1),(7,1),(8,1),(9,1),(10,1),(11,1),(13,1),(16,1),(18,1),(19,1)], 
    [(5,9)],
    [(0,7),(1,7),(3,7),(4,7),(5,7),(8,7),(9,7),(10,7),(14,7)], 
    [(0,8),(1,8),(2,8),(3,8),(4,8),(5,8),(6,8),(7,8),(8,8),(9,8),(10,8),(11,8),(12,8),(13,8),(14,8),(15,8),(16,8),(17,8),(18,8),(19,8)], # this might not work  
    [(2,1),(4,1),(5,1),(5,1),(15,1)], #(2,1)
    [(5,4),(8,4),(14,4),(17,4),(19,4)], #(5,4)
    [(5,6)],
    [(4,7),(6,7),(17,7),(16,7)] #(4,7)
]

In [None]:
# 010_ECommerce (changed)
qa_dict[dfs_train[9]]['answer_coords'] = [
    [(0,2),(1,2),(2,2),(3,2),(4,2),(5,2),(6,2),(7,2),(8,2),(9,2),(10,2),(11,2),(12,2),(13,2),(14,2),(15,2),(16,2),(17,2),(18,2),(19,2)], 
    [(0,0)], 
    [(4,5),(5,5),(7,5),(8,5),(9,5),(10,5),(13,5),(15,5),(17,5),(19,5)], # (4,5)
    [(0,1),(1,1),(2,1),(3,1),(4,1),(5,1),(6,1),(7,1),(8,1),(9,1),(10,1),(11,1),(12,1),(13,1),(14,1),(15,1),(16,1),(17,1),(18,1),(19,1),],  
    [(0,3),(4,3),(7,3),(10,3),(12,3),(13,3),(19,3)], # (0,3)
    [(0,5),(4,5),(7,5),(10,5),(12,5),(13,5),(19,5)], # (0,5)
    [(0,6),(2,6),(4,6),(7,6),(8,6),(12,6),(14,6),(15,6),(17,6),(18,6)], #(0,6)
    [(0,1)]
]

In [None]:
# 009_Central (not changes needed)
qa_dict[dfs_train[8]]['answer_coords'] = [
    [(2,1)], 
    [(6,0)], 
    [(2,4)], 
    [(1,3)], 
    [(2,2)], 
    [(6,2)], 
    [(2,2)],
    [(1,2)]
]

In [None]:
# 008_Tornadoes (changed)
qa_dict[dfs_train[7]]['answer_coords'] = [
    [(0,6),(1,6),(2,6),(3,6),(4,6),(7,6),(8,6),(9,6),(12,6),(15,6),(16,6),(18,6)], 
    [(10,3)],
    [(11,7)],
    [(16,4)],
    [(4,6),(5,6),(6,6),(17,6)], # (4,6)
    [(4,5),(6,5),(7,5),(11,5),(18,5)], #(6,5)
    [(16,0)],
    [(11,0)]
]

In [None]:
# 007_Fifa (changed)
qa_dict[dfs_train[6]]['answer_coords'] = [
    [(0,11),(1,11),(2,11),(3,11),(4,11),(5,11),(6,11),(7,11),(8,11),(9,11),(10,11),(11,11),(12,11),(13,11),(14,11),(15,11),(16,11),(17,11),(18,11)], 
    [(4,10)],
    [(11,3)],
    [(0,8),(2,8)], #(0,8)
    [(0,5),(1,5),(2,5),(4,5),(5,5),(6,5),(7,5),(8,5),(9,5),(10,5),(11,5),(12,5),(13,5),(16,5),(17,5)], #(0,5)
    [(12,11),(19,11)], #(19,11)
    [(0,3),(2,3),(4,3),(8,3),(9,3),(10,3),(12,3),(13,3),(14,3),(15,3)] #(2,3)
]

In [None]:
# 005_NYC (changed)
qa_dict[dfs_train[4]]['answer_coords'] = [
    [(0,5),(1,5),(3,5),(4,5),(5,5),(11,5),(15,5)], 
    [(14,6)], # dont get
    [(0,7),(1,7),(2,7),(3,7),(4,7),(5,7),(6,7),(8,7),(9,7),(10,7),(11,7),(13,7),(14,7),(15,7),(18,7),(19,7)],
    [(1,1),(3,1),(7,1),(9,1),(11,1),(12,1),(13,1),(14,1),(19,1)], #(1,1)
    [(4,3),(5,3),(8,3),(9,3)], #(4,3)
    [(17,4)],
    [(4,5)]
]

In [None]:
# 004_Taxi (changes made)
qa_dict[dfs_train[3]]['answer_coords'] = [
    [(2,3)], #
    [(0,8),(1,8),(2,8),(3,8),(4,8),(5,8),(6,8),(7,8),(8,8),(9,8),(10,8),(11,8),(12,8),(13,8),(15,8),(16,8),(18,8),(19,8)],
    [(0,7),(1,7),(2,7),(3,7),(4,7),(5,7),(6,7),(7,7),(8,7),(9,7),(10,7),(11,7),(12,7),(13,7),(14,7),(15,7),(16,7),(17,7),(18,7),(19,7)],
    [(0,1),(1,1),(3,1),(4,1),(5,1),(6,1),(7,1),(8,1),(9,1),(11,1),(12,1),(13,1),(15,1),(16,1),(17,1),(18,1),(19,1)], #(0,1)
    [(1,4),(2,4),(3,4),(4,4),(6,4),(7,4),(8,4),(9,4),(10,4),(12,4),(13,4),(17,4)], # (1,4)
    [(3,5),(4,5),(9,5)], #(3,5)
    [(0,2)]
]

In [11]:
# 002_Titanic (changes made)
qa_dict[dfs_train[1]]['answer_coords'] = [
    [(0, 4)],
    [(0, 0)],
    [(0, 4),(2, 4),(3, 4),(5, 4),(6, 4),(8, 4),(12, 4),(14, 4),(15, 4),(16, 4),(18, 4)], # (0, 4)
    [(12, 2)]
]

In [12]:
# 001_Forbes (not changes needed)
qa_dict[dfs_train[0]]['answer_coords'] = [
    [(11,5)], 
    [(14,6)],
    [(7,8)],
    [(18,10)],
    [(0,4)],
    [(11,9)] 
]


In [13]:
# set up the WTQ style tokenizer for testing if will tokenize
config = TapasConfig.from_pretrained(
    "google/tapas-base-finetuned-wtq",
    aggregation_labels=True,  # Enable aggregation operators
)

# Initialize the tokenizer and model with the configuration
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq", config=config)




In [14]:
working_ids = [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18]

In [26]:
for df_num in working_ids:
    try:
        table = pd.read_csv(qa_dict[dfs_train[df_num]]['dataset'][0]).astype(str)
        queries = list(qa_dict[dfs_train[df_num]]['question'])
        answer_coordinates = list(qa_dict[dfs_train[df_num]]['answer_coords'])
        answer_text = list(qa_dict[dfs_train[df_num]]['sample_answer'])
        inputs = tokenizer(
            table = table,
            queries = queries,
            answer_coordinates = answer_coordinates,
            answer_text = answer_text,
            padding = "max_length",
            truncation=True,  
            return_tensors = "pt"
        )
        print(dfs_train[df_num], ' tokenized with no error')
    except:
        print('error tokenizing ', dfs_train[df_num])

error tokenizing  001_Forbes
002_Titanic  tokenized with no error
error tokenizing  004_Taxi
error tokenizing  005_NYC
error tokenizing  007_Fifa
error tokenizing  008_Tornados
error tokenizing  009_Central
error tokenizing  010_ECommerce
error tokenizing  011_SF
error tokenizing  012_Heart
error tokenizing  013_Roller
error tokenizing  015_Food
error tokenizing  016_Holiday
error tokenizing  017_Hacker
error tokenizing  018_Staff
error tokenizing  019_Aircraft


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


In [28]:
# save the datasets in the working_ids to a dataset
working_df = pd.DataFrame()
for i in working_ids:
    df = qa_dict[dfs_train[i]]
    working_df = pd.concat([working_df, df], ignore_index=True)
    
working_df.to_csv('toy_df_v2.csv', index=False)

In [29]:
dev_ids = [40,41,42,42,44,45]
for i in dev_ids:
    df = qa_dict[dfs_train[i]]
    working_df = pd.concat([working_df, df], ignore_index=True)
    
working_df.to_csv('dev.csv', index=False)

In [30]:
test_ids = [34,35,36,37,38,39]
for i in dev_ids:
    df = qa_dict[dfs_train[i]]
    working_df = pd.concat([working_df, df], ignore_index=True)
    
working_df.to_csv('test.csv', index=False)