File to train on the toy_df.csv (that came from the 001_Forbes Dataset)

In [2]:
import os
import ast
import torch
import pandas as pd
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer, AdamW

In [3]:
# load in tokenizer
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")



In [4]:
# Function to convert the string representation to a list of tuples
def parse_answer_coords(coords_str):
    try:
        # Safely evaluate the string to a Python object
        coords = ast.literal_eval(coords_str)
        
        # Ensure the result is a list of tuples with integers
        if isinstance(coords, list) and all(
            isinstance(coord, (tuple, list)) and len(coord) == 2 and all(isinstance(x, int) for x in coord)
            for coord in coords
        ):
            return [tuple(coord) for coord in coords]  # Convert lists to tuples if needed
        else:
            raise ValueError(f"Invalid format for answer_coords: {coords_str}")
    except (ValueError, SyntaxError) as e:
        raise ValueError(f"Error parsing answer_coords: {coords_str}. Details: {e}")

In [5]:
# class to tokenizer dataset
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = self.process_answer_coords_column(data)
        self.tokenizer = tokenizer
        self.table_csv_path = 'data/'

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        table = pd.read_csv(self.table_csv_path + item.dataset).astype(
            str
        )  # be sure to make your table data text only
        encoding = self.tokenizer(
            table=table,
            queries=item.question,
            answer_coordinates=item.answer_coords,
            answer_text=item.sample_answer,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # remove the batch dimension which the tokenizer adds by default
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        # add the float_answer which is also required (weak supervision for aggregation case)
        encoding["float_answer"] = torch.tensor(item.float_answer)
        return encoding

    def __len__(self):
        return len(self.data)
    
    # change answer_coords from strings
    def process_answer_coords_column(self, data, column_name="answer_coords"):
        if column_name not in data.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the dataset.")

        def parse_answer_coords(coords_str):
            try:
                coords = ast.literal_eval(coords_str)
                if isinstance(coords, list) and all(
                    isinstance(coord, (tuple, list)) and len(coord) == 2 and all(isinstance(x, int) for x in coord)
                    for coord in coords
                ):
                    return [tuple(coord) for coord in coords]
                else:
                    raise ValueError(f"Invalid format for answer_coords: {coords_str}")
            except (ValueError, SyntaxError) as e:
                raise ValueError(f"Error parsing answer_coords: {coords_str}. Details: {e}")
        data[column_name] = data[column_name].apply(parse_answer_coords)
        return data

In [31]:
# load in toy_df, tokenize and place in dataloader
csv_path = 'data/toy_df.csv'
 
toy_df = pd.read_csv(csv_path)
datasets = list(set(toy_df['dataset']))

dev_datasets = datasets[0]
train_datasets = datasets[1:]
# Apply the function to the answer_coords column

# split into train and dev
toy_df_train = toy_df[toy_df['dataset'].isin(train_datasets)]
toy_df_dev = toy_df[toy_df['dataset'] == dev_datasets]

# load train dataloader
train_dataset = TableDataset(toy_df_train, tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4)

# load dev dataloader
dev_dataset = TableDataset(toy_df_dev, tokenizer)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].apply(parse_answer_coords)


In [21]:
# load in WTQ model
config = TapasConfig(
    num_aggregation_labels=4,
    use_answer_as_supervision=True,
    answer_loss_cutoff=0.664694,
    cell_selection_preference=0.207951,
    huber_loss_delta=0.121194,
    init_cell_selection_weights_to_zero=True,
    select_one_column=True,
    allow_empty_column_selection=False,
    temperature=0.0352513,
)
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq", config=config)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [22]:
# finetune the WTQ model
model.train()
for epoch in range(10):
    print('On epoch:', epoch) # loop over the dataset multiple times
    for batch in train_dataloader:
        # get the inputs;
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        labels = batch["labels"]
        numeric_values = batch["numeric_values"]
        numeric_values_scale = batch["numeric_values_scale"]
        float_answer = batch["float_answer"]
        
        if isinstance(numeric_values, pd.Series):
            numeric_values = numeric_values.iloc[:] 
            
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
print('Done Training')

On epoch: 0
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(7, 8)]
[(18, 10)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]
[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]
[(17, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(11, 7)]
[(16, 4)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]
[(16, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 1
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(7, 8)]
[(18, 10)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(0, 4)]
[(0, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(1, 4)]
[(3, 5)]


  cell = row[col_index]


[(0, 2)]
[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(17, 4)]
[(4, 5)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 7)]
[(16, 4)]
[(4, 6)]
[(6, 5)]
[(16, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 2
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(7, 8)]
[(18, 10)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(0, 4)]
[(0, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]
[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]
[(1, 1)]
[(4, 3)]
[(17, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(11, 7)]
[(16, 4)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]
[(16, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 0)]
On epoch: 3
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]
[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]
[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(17, 4)]
[(4, 5)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(11, 7)]
[(16, 4)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(16, 0)]
[(11, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 4
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]
[(0, 4)]
[(12, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(2, 3)]
[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]
[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(17, 4)]
[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 7)]
[(16, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(16, 0)]
[(11, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 5
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(2, 3)]
[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  cell = row[col_index]


[(1, 1)]
[(4, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(17, 4)]
[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 7)]
[(16, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(16, 0)]
[(11, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 6
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(2, 3)]
[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]
[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]
[(17, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 7)]
[(16, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(16, 0)]
[(11, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 7
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(1, 4)]
[(3, 5)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]
[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]
[(17, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(11, 7)]
[(16, 4)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(16, 0)]
[(11, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 8
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]
[(2, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(1, 4)]
[(3, 5)]


  cell = row[col_index]


[(0, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]
[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]
[(17, 4)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(11, 7)]
[(16, 4)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(4, 6)]
[(6, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(16, 0)]
[(11, 0)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


On epoch: 9
[(11, 5)]
[(14, 6)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(7, 8)]
[(18, 10)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(11, 9)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(0, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 4)]
[(12, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(2, 3)]
[(0, 8), (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (10, 8), (11, 8), (12, 8), (13, 8), (15, 8), (16, 8), (18, 8), (19, 8)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (7, 7), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7)]
[(1, 1)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 4)]
[(3, 5)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 2)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(0, 5), (1, 5), (3, 5), (4, 5), (5, 5), (11, 5), (15, 5)]
[(14, 6)]
[(0, 7), (1, 7), (2, 7), (3, 7), (4, 7), (5, 7), (6, 7), (8, 7), (9, 7), (10, 7), (11, 7), (13, 7), (14, 7), (15, 7), (18, 7), (19, 7)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(1, 1)]
[(4, 3)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)


[(17, 4)]
[(4, 5)]


  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (7, 6), (8, 6), (9, 6), (12, 6), (15, 6), (16, 6), (18, 6)]
[(10, 3)]
[(11, 7)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(16, 4)]
[(4, 6)]
[(6, 5)]
[(16, 0)]


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


[(11, 0)]
Done Training


In [33]:
# try inference

# data is the dev_dataset
# inputs is the tokenized inputs (dev, but just queries and table)
print('dev on:', dev_datasets)
table = pd.read_csv(f'data/{dev_datasets}').astype(str)
queries = list(toy_df_dev['question'])
print(queries)
inputs = tokenizer(
    table = table,
    queries = queries,
    padding="max_length",
    return_tensors="pt",
    truncation = True
)

outputs = model(**inputs)
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
)

# print results:
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

answers = []
for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
        # only a single cell:
        answers.append(table.iat[coordinates[0]])
    else:
        # multiple cells
        cell_values = []
        for coordinate in coordinates:
            cell_values.append(table.iat[coordinate])
        answers.append(", ".join(cell_values))

display(table)
print("")

#TODO: add functionality to make it so that you can see the true answer too
for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
    print(query)
    if predicted_agg == "NONE":
        print("Predicted answer: " + answer)
    else:
        print("Predicted answer: " + predicted_agg + " > " + answer)

dev on: 007_Fifa.csv
['How many unique clubs are there in the dataset?', 'What is the highest value (in €) of a player in the dataset?', "How many players have the position 'ST'?", 'What is the most common nationality in the dataset?', 'What is the most common preferred foot amongst players?', 'Which club has the most players in the dataset?', 'What is the most common position of players in the dataset?']


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Unnamed: 0,Joined<gx:date>,Overall<gx:number>,Age<gx:number>,Position<gx:category>,Wage_€<gx:currency>,Preferred Foot<gx:category>,Potential<gx:number>,Agility<gx:number>,Nationality<gx:category>,Height_ft<gx:number>,Value_€<gx:currency>,Club<gx:category>
0,,72,26,SUB,19000,Right,72,74.0,Ghana,5.11,3300000,Hannover 96
1,"Jul 1, 2018",66,20,RDM,6000,Right,78,90.0,Luxembourg,5.9,1100000,1. FSV Mainz 05
2,"Sep 1, 2020",73,29,SUB,22000,Right,73,88.0,Ghana,5.8,3800000,Hellas Verona
3,"Jan 1, 2018",57,21,RES,500,Left,68,64.0,Uruguay,6.1,180000,River Plate Montevideo
4,"Jan 1, 2015",66,29,SUB,10000,Right,66,52.0,Saudi Arabia,6.1,550000,Al Hilal
5,"Aug 20, 2020",56,18,RES,2000,Right,67,54.0,England,5.9,130000,Burnley
6,"Aug 24, 2017",64,22,RES,18000,Right,71,52.0,Netherlands,6.4,625000,Leeds United
7,"Jul 21, 2018",77,23,LM,29000,Right,86,86.0,Ivory Coast,5.9,13500000,Sassuolo
8,"Jul 1, 2017",73,31,SUB,16000,Right,73,62.0,Netherlands,6.1,3600000,FC Basel 1893
9,"Nov 23, 2019",65,26,SUB,3000,Right,68,56.0,Spain,6.3,675000,CD Lugo



How many unique clubs are there in the dataset?
Predicted answer: COUNT > Hannover 96, 1. FSV Mainz 05, Hellas Verona, River Plate Montevideo, Al Hilal, Burnley, Leeds United, Sassuolo, FC Basel 1893, CD Lugo, SC Paderborn 07, Ulsan Hyundai FC, Lech Poznań, SG Dynamo Dresden
What is the highest value (in €) of a player in the dataset?
Predicted answer: 13500000
How many players have the position 'ST'?
Predicted answer: COUNT > RDM, SUB, ST, SUB, GK
What is the most common nationality in the dataset?
Predicted answer: Poland
What is the most common preferred foot amongst players?
Predicted answer: 
Which club has the most players in the dataset?
Predicted answer: Leeds United, Inter
What is the most common position of players in the dataset?
Predicted answer: SUB, SUB


In [None]:
#TODO: calculate accuracy