In [None]:
import requests, zipfile, io
import os

def download_files(dir_name):
  """Downloads and unpacks the SQA example archives, once.

  The archives are extracted into the current working directory. `dir_name`
  only acts as a "download already happened" marker: it is created after every
  archive has been extracted, so that later calls skip the download.

  Args:
    dir_name: Path of the marker directory. Nothing is downloaded if it
      already exists.

  Raises:
    requests.HTTPError: If one of the downloads returns an error status.
    zipfile.BadZipFile: If a response body is not a valid zip archive.
  """
  if os.path.exists(dir_name):
    return

  # 28 training examples from the SQA training set + table csv data
  urls = ["https://www.dropbox.com/s/2p6ez9xro357i63/sqa_train_set_28_examples.zip?dl=1",
          "https://www.dropbox.com/s/abhum8ssuow87h6/table_csv.zip?dl=1"
  ]
  for url in urls:
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of handing an error page to zipfile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
      archive.extractall()

  # extractall() writes to the current directory, so the guard at the top
  # would only ever pass if an archive happened to contain `dir_name`. Create
  # the marker explicitly once everything has been extracted successfully.
  os.makedirs(dir_name, exist_ok=True)

# Fetch the example data; download_files() skips the download when this path
# already exists.
dir_name = "sqa_data"
download_files(dir_name)

# Loading the data

In [None]:
import pandas as pd

# Read the question/answer annotations from the Excel sheet into a dataframe
# and preview the first rows.
data = pd.read_excel("sqa_train_set_28_examples.xlsx")
data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"['(0, 4)', '(1, 4)', '(2, 4)', '(3, 4)', '(4, ...","['Louisiana State University', 'Valley HS (Las..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']


In [None]:
import ast

def _parse_answer_coordinates(answer_coordinate_str):
  """Parses the answer_coordinates of a question.
  Args:
    answer_coordinate_str: A string representation of a Python list of tuple
      strings.
      For example: "['(1, 4)','(1, 3)', ...]"
  """

  try:
    answer_coordinates = []
    # make a list of strings
    coords = ast.literal_eval(answer_coordinate_str)
    # parse each string as a tuple
    for row_index, column_index in sorted(
        ast.literal_eval(coord) for coord in coords):
      answer_coordinates.append((row_index, column_index))
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_coordinate_str)
  
  return answer_coordinates


def _parse_answer_text(answer_text):
  """Populates the answer_texts field of `answer` by parsing `answer_text`.
  Args:
    answer_text: A string representation of a Python list of strings.
      For example: "[u'test', u'hello', ...]"
    answer: an Answer object.
  """
  try:
    answer = []
    for value in ast.literal_eval(answer_text):
      answer.append(value)
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_text)

  return answer

# Both answer columns are read as strings; replace them with parsed Python
# lists by handing the parser functions straight to `apply`.
data['answer_coordinates'] = data['answer_coordinates'].apply(_parse_answer_coordinates)
data['answer_text'] = data['answer_text'].apply(_parse_answer_text)

data.head(10)

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
5,nt-639,2,0,who are the players in the top 26?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
6,nt-639,2,1,"of those, which one was from louisiana state u...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
7,nt-11649,0,0,what are all the names of the teams?,table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
8,nt-11649,0,1,"of these, which teams had any losses?",table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
9,nt-11649,0,2,"of these teams, which had more than 21 losses?",table_csv/204_135.csv,"[(15, 1)]",[CD Villarrobledo]


Let's create a new dataframe that groups the questions which are asked one after another about the same table. To do this, we first add a `sequence_id` column, which is a combination of the `id` and `annotator` columns:

In [None]:
def get_sequence_id(example_id, annotator):
  """Builds the identifier "<example_id>-<annotator>".

  Args:
    example_id: First component of the identifier.
    annotator: Second component; its string form must not contain "-".

  Returns:
    The two components joined by a single "-".

  Raises:
    ValueError: If the string form of `annotator` contains "-".
  """
  annotator_has_dash = str(annotator).find("-") >= 0
  if annotator_has_dash:
    raise ValueError('"-" not allowed in annotator.')
  sequence_id = f"{example_id}-{annotator}"
  return sequence_id

# Tag every question row with the identifier built from its `id` and
# `annotator` values.
data['sequence_id'] = data.apply(
    lambda row: get_sequence_id(row.id, row.annotator),
    axis=1,
)
data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text,sequence_id
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve...",nt-639-0
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-0
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-1


In [None]:
# Collapse all rows that share a sequence id into one row whose cells are
# lists, then drop the columns that are no longer needed.
grouped = (
    data.groupby(by='sequence_id')
    .agg(lambda column: column.tolist())
    .drop(columns=['id', 'annotator', 'position'])
)
# Keep only the first table file of each group as a plain string.
grouped['table_file'] = grouped['table_file'].apply(lambda files: files[0])
grouped.head(10)

Unnamed: 0_level_0,question,table_file,answer_coordinates,answer_text
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ns-1292-0,"[who are all the athletes?, where are they fro...",table_csv/204_521.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Tommy Green, Janis Dalins, Ugo Frigerio, Kar..."
nt-10730-0,[what was the production numbers of each revol...,table_csv/203_253.csv,"[[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, ...","[[1,900 (estimated), 14,500 (estimated), 6,000..."
nt-10730-1,[what three revolver models had the least amou...,table_csv/203_253.csv,"[[(0, 0), (6, 0), (7, 0)], [(0, 0)]]","[[Remington-Beals Army Model Revolver, New Mod..."
nt-10730-2,"[what are all of the remington models?, how ma...",table_csv/203_253.csv,"[[(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, ...","[[Remington-Beals Army Model Revolver, Remingt..."
nt-11649-0,"[what are all the names of the teams?, of thes...",table_csv/204_135.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Cordoba CF, CD Malaga, Granada CF, UD Las Pa..."
nt-11649-1,"[what are the losses?, what team had more than...",table_csv/204_135.csv,"[[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (5, ...","[[6, 6, 9, 10, 10, 12, 12, 11, 13, 14, 15, 14,..."
nt-11649-2,"[what were all the teams?, what were the loss ...",table_csv/204_135.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Cordoba CF, CD Malaga, Granada CF, UD Las Pa..."
nt-639-0,"[where are the players from?, which player wen...",table_csv/203_149.csv,"[[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, ...","[[Louisiana State University, Valley HS (Las V..."
nt-639-1,"[who are the players?, which ones are in the t...",table_csv/203_149.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Ben McDonald, Tyler Houston, Roger Salkeld, ..."
nt-639-2,"[who are the players in the top 26?, of those,...",table_csv/203_149.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Ben McDonald, Tyler Houston, Roger Salkeld, ..."


In [None]:
# Directory prefix used to build the path of a table csv file.
table_csv_path = "table_csv"

# Take the first question sequence as a worked example.
item = grouped.iloc[0]
# item.table_file[9:] drops the first 9 characters of the stored path (the
# length of "table_csv"), so the prefix above is not repeated; .astype(str)
# converts every cell of the table to text.
table = pd.read_csv(table_csv_path + item.table_file[9:]).astype(str) 

display(table)
print("")
print(item.question)

Unnamed: 0,Rank,Name,Nationality,Time (hand),Notes
0,,Tommy Green,Great Britain,4:50:10,OR
1,,Janis Dalins,Latvia,4:57:20,
2,,Ugo Frigerio,Italy,4:59:06,
3,4.0,Karl Hahnel,Germany,5:06:06,
4,5.0,Ettore Rivolta,Italy,5:07:39,
5,6.0,Paul Sievert,Germany,5:16:41,
6,7.0,Henri Quintric,France,5:27:25,
7,8.0,Ernie Crosbie,United States,5:28:02,
8,9.0,Bill Chisholm,United States,5:51:00,
9,10.0,Alfred Maasik,Estonia,6:19:00,



['who are all the athletes?', 'where are they from?', 'along with paul sievert, which athlete is from germany?']


In [None]:
import torch
from transformers import TapasTokenizer

# initialize the tokenizer from the "google/tapas-base" checkpoint
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

In [None]:
# Tokenize the example table together with the questions of the sequence and
# their answers. Padding and truncation are enabled and the result is
# returned as PyTorch tensors; the last line lists the produced fields.
encoding = tokenizer(table=table, queries=item.question, answer_coordinates=item.answer_coordinates, answer_text=item.answer_text,
                     truncation=True, padding="max_length", return_tensors="pt")
encoding.keys()

dict_keys(['input_ids', 'labels', 'numeric_values', 'numeric_values_scale', 'token_type_ids', 'attention_mask'])

In [None]:
# Decode the token ids of the first entry of the encoding back to text to
# inspect how the question and the table were flattened.
tokenizer.decode(encoding["input_ids"][0])

'[CLS] who are all the athletes? [SEP] rank name nationality time ( hand ) notes [EMPTY] tommy green great britain 4 : 50 : 10 or [EMPTY] janis dalins latvia 4 : 57 : 20 [EMPTY] [EMPTY] ugo frigerio italy 4 : 59 : 06 [EMPTY] 4. 0 karl hahnel germany 5 : 06 : 06 [EMPTY] 5. 0 ettore rivolta italy 5 : 07 : 39 [EMPTY] 6. 0 paul sievert germany 5 : 16 : 41 [EMPTY] 7. 0 henri quintric france 5 : 27 : 25 [EMPTY] 8. 0 ernie crosbie united states 5 : 28 : 02 [EMPTY] 9. 0 bill chisholm united states 5 : 51 : 00 [EMPTY] 10. 0 alfred maasik estonia 6 : 19 : 00 [EMPTY] [EMPTY] henry cieman canada [EMPTY] dnf [EMPTY] john moralis greece [EMPTY] dnf [EMPTY] francesco pretti italy [EMPTY] dnf [EMPTY] arthur tell schwab switzerland [EMPTY] dnf [EMPTY] harry hinkel united states [EMPTY] dnf [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
# Walk over the second entry of the encoding and print each token next to
# column 3 of its token type ids (presumably the "previous answer" flag -
# TODO confirm against the TAPAS token type layout).
# The loop variable is named `token_id` so the builtin `id` is not shadowed
# for the rest of the notebook.
for token_id, prev_label in zip(encoding["input_ids"][1], encoding["token_type_ids"][1][:, 3]):
  if token_id != 0:  # we skip padding tokens
    print(tokenizer.decode([token_id]), prev_label.item())

[CLS] 0
where 0
are 0
they 0
from 0
? 0
[SEP] 0
rank 0
name 0
nationality 0
time 0
( 0
hand 0
) 0
notes 0
[EMPTY] 0
tommy 1
green 1
great 0
britain 0
4 0
: 0
50 0
: 0
10 0
or 0
[EMPTY] 0
jan 1
##is 1
dali 1
##ns 1
latvia 0
4 0
: 0
57 0
: 0
20 0
[EMPTY] 0
[EMPTY] 0
u 1
##go 1
fr 1
##iger 1
##io 1
italy 0
4 0
: 0
59 0
: 0
06 0
[EMPTY] 0
4 0
. 0
0 0
karl 1
hahn 1
##el 1
germany 0
5 0
: 0
06 0
: 0
06 0
[EMPTY] 0
5 0
. 0
0 0
et 1
##tore 1
ri 1
##vo 1
##lta 1
italy 0
5 0
: 0
07 0
: 0
39 0
[EMPTY] 0
6 0
. 0
0 0
paul 1
si 1
##ever 1
##t 1
germany 0
5 0
: 0
16 0
: 0
41 0
[EMPTY] 0
7 0
. 0
0 0
henri 1
qui 1
##nt 1
##ric 1
france 0
5 0
: 0
27 0
: 0
25 0
[EMPTY] 0
8 0
. 0
0 0
ernie 1
cr 1
##os 1
##bie 1
united 0
states 0
5 0
: 0
28 0
: 0
02 0
[EMPTY] 0
9 0
. 0
0 0
bill 1
chi 1
##sho 1
##lm 1
united 0
states 0
5 0
: 0
51 0
: 0
00 0
[EMPTY] 0
10 0
. 0
0 0
alfred 1
ma 1
##asi 1
##k 1
estonia 0
6 0
: 0
19 0
: 0
00 0
[EMPTY] 0
[EMPTY] 0
henry 1
ci 1
##eman 1
canada 0
[EMPTY] 0
d 0
##n 0
##f 0
[EMPTY] 0

In [None]:
class TableDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one table-question pair per index.

    Every row of `df` is one question. Rows whose `position` is not 0 are
    encoded together with the row directly before them, so the tokenizer sees
    the previous question and its answer; only the encoding of the current
    question is returned.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # TapasTokenizer expects the table data to be text only
        csv_file = table_csv_path + row.table_file[9:]
        table = pd.read_csv(csv_file).astype(str)
        tokenizer_options = dict(
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        if row.position == 0:
            # this means it's the first table-question pair in a sequence
            encoded = self.tokenizer(
                table=table,
                queries=row.question,
                answer_coordinates=row.answer_coordinates,
                answer_text=row.answer_text,
                **tokenizer_options,
            )
            # remove the batch dimension which the tokenizer adds
            return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

        # use the previous table-question pair to correctly set the
        # prev_labels token type ids
        previous_row = self.df.iloc[idx - 1]
        encoded = self.tokenizer(
            table=table,
            queries=[previous_row.question, row.question],
            answer_coordinates=[previous_row.answer_coordinates, row.answer_coordinates],
            answer_text=[previous_row.answer_text, row.answer_text],
            **tokenizer_options,
        )
        # use encodings of second table-question pair in the batch
        return {name: tensor[-1] for name, tensor in encoded.items()}

# Wrap the ungrouped dataframe (one question per row) in the dataset and
# serve it in batches of two examples.
train_dataset = TableDataset(df=data, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2)

In [None]:
# Fetch the first batch explicitly: no earlier cell defines `batch`, so this
# cell would raise a NameError on a fresh kernel.
batch = next(iter(train_dataloader))

# Print every non-padding token of the second example in the batch next to
# column 3 of its token type ids. The loop variable is named `token_id` so
# the builtin `id` is not shadowed.
for token_id, prev_label in zip(batch["input_ids"][1], batch["token_type_ids"][1][:, 3]):
  if token_id != 0:  # we skip padding tokens
    print(tokenizer.decode([token_id]), prev_label.item())

[CLS] 0
which 0
player 0
went 0
to 0
louisiana 0
state 0
university 0
? 0
[SEP] 0
pick 0
player 0
team 0
position 0
school 0
1 0
ben 0
mcdonald 0
baltimore 0
orioles 0
r 0
##hp 0
louisiana 1
state 1
university 1
2 0
tyler 0
houston 0
atlanta 0
braves 0
c 0
valley 1
hs 1
( 1
las 1
vegas 1
, 1
n 1
##v 1
) 1
3 0
roger 0
sal 0
##kel 0
##d 0
seattle 0
mariners 0
r 0
##hp 0
sa 1
##ug 1
##us 1
( 1
ca 1
) 1
hs 1
4 0
jeff 0
jackson 0
philadelphia 0
phillies 0
of 0
simeon 1
hs 1
( 1
chicago 1
, 1
il 1
) 1
5 0
donald 0
harris 0
texas 0
rangers 0
of 0
texas 1
tech 1
university 1
6 0
paul 0
coleman 0
saint 0
louis 0
cardinals 0
of 0
franks 1
##ton 1
( 1
tx 1
) 1
hs 1
7 0
frank 0
thomas 0
chicago 0
white 0
sox 0
1b 0
auburn 1
university 1
8 0
earl 0
cunningham 0
chicago 0
cubs 0
of 0
lancaster 1
( 1
sc 1
) 1
hs 1
9 0
kyle 0
abbott 0
california 0
angels 0
l 0
##hp 0
long 1
beach 1
state 1
university 1
10 0
charles 0
johnson 0
montreal 0
expo 0
##s 0
c 0
westwood 1
hs 1
( 1
fort 1
pierce 1
, 1
fl 1
) 

# Training loop

In [None]:
from transformers import TapasForQuestionAnswering

# Load the "google/tapas-base" checkpoint into a question-answering model.
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1432.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442768791.0, style=ProgressStyle(descri…




Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['column_output_bias', 'output_bias', 'column_output_weights', 'output_weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TapasForQuestionAnswering(
  (tapas): TapasModel(
    (embeddings): TapasEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings_0): Embedding(3, 768)
      (token_type_embeddings_1): Embedding(256, 768)
      (token_type_embeddings_2): Embedding(256, 768)
      (token_type_embeddings_3): Embedding(2, 768)
      (token_type_embeddings_4): Embedding(256, 768)
      (token_type_embeddings_5): Embedding(256, 768)
      (token_type_embeddings_6): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.07, inplace=False)
    )
    (encoder): TapasEncoder(
      (layer): ModuleList(
        (0): TapasLayer(
          (attention): TapasAttention(
            (self): TapasSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)


In [None]:
from transformers import AdamW

# NOTE(review): newer transformers releases steer users to torch.optim.AdamW
# instead of the transformers implementation - confirm for the installed
# version before swapping (the default hyper-parameters differ).
optimizer = AdamW(model.parameters(), lr=5e-5)

# from_pretrained() hands the model back in evaluation mode; switch to
# training mode so that dropout is active while fine-tuning.
model.train()

for epoch in range(10):  # loop over the dataset multiple times
    print("Epoch:", epoch)
    for batch in train_dataloader:
        # move the inputs of this batch to the device the model lives on
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                        labels=labels)
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()

Epoch: 0
Loss: 2.283051013946533
Loss: 2.321470022201538
Loss: 1.304502010345459
Loss: 1.8132498264312744
Loss: 1.5585863590240479
Loss: 2.8958029747009277
Loss: 2.3229925632476807
Loss: 2.9768738746643066
Loss: 2.43325138092041
Loss: 2.588594913482666
Loss: 2.432821273803711
Loss: 2.077129602432251
Loss: 2.5189807415008545
Loss: 1.117794394493103
Epoch: 1
Loss: 2.6343977451324463
Loss: 1.1567877531051636
Loss: 0.8736000061035156
Loss: 1.1256351470947266
Loss: 1.193580985069275
Loss: 2.019536018371582
Loss: 1.9377449750900269
Loss: 2.7245376110076904
Loss: 2.4017510414123535
Loss: 1.299104928970337
Loss: 1.1747502088546753
Loss: 1.3556476831436157
Loss: 1.2996423244476318
Loss: 0.8333960771560669
Epoch: 2
Loss: 1.1792304515838623
Loss: 0.7969248294830322
Loss: 0.5043489336967468
Loss: 0.8913598656654358
Loss: 1.618265986442566
Loss: 1.099987268447876
Loss: 0.8575657606124878
Loss: 1.231074333190918
Loss: 1.5082621574401855
Loss: 0.736372709274292
Loss: 0.9882394671440125
Loss: 0.855469