File to train on the toy_df.csv (that came from the 001_Forbes Dataset)

In [47]:
import os
import ast
import torch
import pandas as pd
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer, AdamW

In [48]:
# load in tokenizer
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")



In [49]:
# Function to convert the string representation to a list of tuples
def parse_answer_coords(coords_str):
    try:
        # Safely evaluate the string to a Python object
        coords = ast.literal_eval(coords_str)
        
        # Ensure the result is a list of tuples with integers
        if isinstance(coords, list) and all(
            isinstance(coord, (tuple, list)) and len(coord) == 2 and all(isinstance(x, int) for x in coord)
            for coord in coords
        ):
            return [tuple(coord) for coord in coords]  # Convert lists to tuples if needed
        else:
            raise ValueError(f"Invalid format for answer_coords: {coords_str}")
    except (ValueError, SyntaxError) as e:
        raise ValueError(f"Error parsing answer_coords: {coords_str}. Details: {e}")

In [50]:
# class to tokenizer dataset
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = self.process_answer_coords_column(data)
        self.tokenizer = tokenizer
        self.table_csv_path = 'data/'

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        print(item.answer_coords)
        table = pd.read_csv(self.table_csv_path + item.dataset).astype(
            str
        )  # be sure to make your table data text only
        encoding = self.tokenizer(
            table=table,
            queries=item.question,
            answer_coordinates=item.answer_coords,
            answer_text=item.sample_answer,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # remove the batch dimension which the tokenizer adds by default
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        # add the float_answer which is also required (weak supervision for aggregation case)
        encoding["float_answer"] = torch.tensor(item.float_answer)
        return encoding

    def __len__(self):
        return len(self.data)
    
    # change answer_coords from strings
    def process_answer_coords_column(self, data, column_name="answer_coords"):
        if column_name not in data.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the dataset.")

        def parse_answer_coords(coords_str):
            try:
                coords = ast.literal_eval(coords_str)
                if isinstance(coords, list) and all(
                    isinstance(coord, (tuple, list)) and len(coord) == 2 and all(isinstance(x, int) for x in coord)
                    for coord in coords
                ):
                    return [tuple(coord) for coord in coords]
                else:
                    raise ValueError(f"Invalid format for answer_coords: {coords_str}")
            except (ValueError, SyntaxError) as e:
                raise ValueError(f"Error parsing answer_coords: {coords_str}. Details: {e}")
        data[column_name] = data[column_name].apply(parse_answer_coords)
        return data

In [51]:
#TODO: generalize so that we can pull the unique csv names from the new dataset and filter through them to create a dataloader for each

# load in toy_df, tokenize and place in dataloader
csv_path = 'data/toy_df.csv'
toy_df = pd.read_csv(csv_path)

# Apply the function to the answer_coords column


# split into train and dev
toy_df_train = toy_df.iloc[[0, 2, 3, 4]]
toy_df_dev = toy_df.iloc[[1, 5]]

# load train dataloader
train_dataset = TableDataset(toy_df_train, tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2)

# load dev dataloader
dev_dataset = TableDataset(toy_df_dev, tokenizer)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].apply(parse_answer_coords)


In [52]:
# load in WTQ model
config = TapasConfig(
    num_aggregation_labels=4,
    use_answer_as_supervision=True,
    answer_loss_cutoff=0.664694,
    cell_selection_preference=0.207951,
    huber_loss_delta=0.121194,
    init_cell_selection_weights_to_zero=True,
    select_one_column=True,
    allow_empty_column_selection=False,
    temperature=0.0352513,
)
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['aggregation_classifier.bias', 'aggregation_classifier.weight', 'column_output_bias', 'column_output_weights', 'output_bias', 'output_weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# finetune the WTQ model
model.train()
for epoch in range(2):  # loop over the dataset multiple times
    for batch in train_dataloader:
        # get the inputs;
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        labels = batch["labels"]
        numeric_values = batch["numeric_values"]
        numeric_values_scale = batch["numeric_values_scale"]
        float_answer = batch["float_answer"]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )
        print(outputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
print('Done Training')

[(11, 5)]
[(7, 8)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


TableQuestionAnsweringOutput(loss=tensor(2.2632, grad_fn=<AddBackward0>), logits=tensor([[-10000., -10000., -10000.,  ..., -10000., -10000., -10000.],
        [-10000., -10000., -10000.,  ..., -10000., -10000., -10000.]],
       grad_fn=<ViewBackward0>), logits_aggregation=tensor([[ 0.2091,  0.1775, -0.0712,  0.3980],
        [ 0.0871,  0.1593, -0.1162,  0.3574]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[(18, 10)]
[(0, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


TableQuestionAnsweringOutput(loss=tensor(3.9879, grad_fn=<AddBackward0>), logits=tensor([[-10000.1250, -10000.1250, -10000.1250,  ..., -10000.1250,
         -10000.1250, -10000.1250],
        [-10000.1260, -10000.1260, -10000.1260,  ..., -10000.1260,
         -10000.1260, -10000.1260]], grad_fn=<ViewBackward0>), logits_aggregation=tensor([[ 0.2936, -0.3012, -0.4651, -0.0207],
        [ 0.2969, -0.3606, -0.4653, -0.0208]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[(11, 5)]
[(7, 8)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


TableQuestionAnsweringOutput(loss=tensor(1.8844, grad_fn=<AddBackward0>), logits=tensor([[-10000.2402, -10000.2402, -10000.2402,  ..., -10000.2402,
         -10000.2402, -10000.2402],
        [-10000.2393, -10000.2393, -10000.2393,  ..., -10000.2393,
         -10000.2393, -10000.2393]], grad_fn=<ViewBackward0>), logits_aggregation=tensor([[ 0.5836, -0.1453, -0.3825,  0.1192],
        [ 0.6485, -0.1279, -0.4293,  0.1109]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[(18, 10)]
[(0, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


TableQuestionAnsweringOutput(loss=tensor(3.6453, grad_fn=<AddBackward0>), logits=tensor([[-10000.2979, -10000.2979, -10000.2979,  ..., -10000.2979,
         -10000.2979, -10000.2979],
        [-10000.2988, -10000.2988, -10000.2988,  ..., -10000.2988,
         -10000.2988, -10000.2988]], grad_fn=<ViewBackward0>), logits_aggregation=tensor([[ 0.8348, -0.1410, -0.2435,  0.3021],
        [ 0.8507, -0.0115, -0.3043,  0.3783]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Done Training


In [53]:
# try inference

# data is the 001_forbes.csv
# inputs is the tokenized inputs (dev, but just queries and table)

table = pd.read_csv('data/001_Forbes.csv').astype(str)
queries = list(toy_df_dev['question'])

inputs = tokenizer(
    table = table,
    queries = queries,
    padding="max_length",
    return_tensors="pt"
)

outputs = model(**inputs)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
)

# print results:
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

answers = []
for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
        # only a single cell:
        answers.append(table.iat[coordinates[0]])
    else:
        # multiple cells
        cell_values = []
        for coordinate in coordinates:
            cell_values.append(table.iat[coordinate])
        answers.append(", ".join(cell_values))

display(table)
print("")
for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
    print(query)
    if predicted_agg == "NONE":
        print("Predicted answer: " + answer)
    else:
        print("Predicted answer: " + predicted_agg + " > " + answer)

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Unnamed: 0,selfMade,finalWorth,city,title,gender,age,rank,philanthropyScore,category,source,country
0,False,7800,Atlanta,Chairman,M,74.0,296,2.0,Media & Entertainment,"media, automotive",United States
1,True,1700,Ningbo,,M,86.0,1729,,Manufacturing,precision machinery,China
2,True,2000,Wuhan,,M,49.0,1513,,Real Estate,real estate,China
3,True,1100,São Paulo,,M,69.0,2448,,Diversified,pharmaceuticals,Brazil
4,True,3300,Sao Jose dos Pinhais,,M,72.0,913,,Fashion & Retail,cosmetics,Brazil
5,False,5200,Southampton,,F,79.0,523,1.0,Media & Entertainment,"media, automotive",United States
6,False,4700,Taipei,,M,54.0,601,,Finance & Investments,financial services,Taiwan
7,True,5300,Singapore,,M,51.0,509,,Food & Beverage,restaurants,Singapore
8,True,2000,Toronto,,M,65.0,1513,,Finance & Investments,real estate finance,Canada
9,False,2600,Dubai,Athlete,M,,1196,,Diversified,diversified,United Arab Emirates



What's the rank of the wealthiest non-self-made billionaire?
Predicted answer: SUM > 
What's the source of wealth for the youngest billionaire?
Predicted answer: SUM > 


In [54]:
print("inputs")
print(inputs)
print("outputs")
print(outputs)
print("Logits:")
print(outputs.logits)
print("Aggregation Logits:")
print(outputs.logits_aggregation)

inputs
{'input_ids': tensor([[ 101, 2054, 1005,  ...,    0,    0,    0],
        [ 101, 2054, 1005,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
outputs
TableQuestionAnsweringOutput(loss=None, logits=tensor([[-88.7000, -88.7000, -88.7000,  ..., -88.7000, -88.7000, -88.7000],
        [-88.7000, -88.7000, -88.7000,  ..., -88.7000, -88.7000, -88.7000]],
       grad_fn=<ViewBackward0>), logits_aggregation=tensor([[-0.3734,  0.2418, -0.0422, -0.6087],
        

In [55]:
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
)

# Print predicted answer coordinates and aggregation indices
print("Predicted Answer Coordinates:")
print(predicted_answer_coordinates)
print("Predicted Aggregation Indices:")
print(predicted_aggregation_indices)

# apperently this error might be due to not crossing a confidence threshold so that it will not give valid answers

Predicted Answer Coordinates:
[[], []]
Predicted Aggregation Indices:
[1, 1]


In [56]:
print("Input IDs Shape:", inputs['input_ids'].shape)
print("Token Type IDs Shape:", inputs['token_type_ids'].shape)
print("Attention Mask Shape:", inputs['attention_mask'].shape)
print("Logits Shape:", outputs.logits.shape)
print("Aggregation Logits Shape:", outputs.logits_aggregation.shape)
print("Logits:", outputs.logits)
print("Aggregation Logits:", outputs.logits_aggregation)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
)
print("Predicted Coordinates:", predicted_answer_coordinates)
print("Predicted Aggregation Indices:", predicted_aggregation_indices)
for batch in train_dataloader:
    print(batch["input_ids"].shape)
    print(batch["float_answer"])
    break

Input IDs Shape: torch.Size([2, 512])
Token Type IDs Shape: torch.Size([2, 512, 7])
Attention Mask Shape: torch.Size([2, 512])
Logits Shape: torch.Size([2, 512])
Aggregation Logits Shape: torch.Size([2, 4])
Logits: tensor([[-88.7000, -88.7000, -88.7000,  ..., -88.7000, -88.7000, -88.7000],
        [-88.7000, -88.7000, -88.7000,  ..., -88.7000, -88.7000, -88.7000]],
       grad_fn=<ViewBackward0>)
Aggregation Logits: tensor([[-0.3734,  0.2418, -0.0422, -0.6087],
        [-0.2633,  0.0688, -0.0211, -0.3979]], grad_fn=<AddmmBackward0>)
Predicted Coordinates: [[], []]
Predicted Aggregation Indices: [1, 1]
[(11, 5)]
[(7, 8)]
torch.Size([2, 512])
tensor([32., nan], dtype=torch.float64)


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
