In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import (
    TapasConfig,
    TapasForQuestionAnswering,
    TapasTokenizer
)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [2]:
import ast
import pandas as pd

def _parse_answer_coordinates(coord):
    if coord is None or pd.isna(coord):
        return []


    coord_str = str(coord).strip()

    try:
        parsed = ast.literal_eval(coord_str)
    except Exception:
       
        coord_str = coord_str.replace("(", "").replace(")", "")
        if "," in coord_str:
            r, c = coord_str.split(",")
            return [(int(r), int(c))]
        return []

    results = []

  
    if isinstance(parsed, list):
        for item in parsed:
            # item = (1,2)
            if isinstance(item, (tuple, list)):
                r, c = item
                results.append((int(r), int(c)))

            # item = "(1,2)"
            elif isinstance(item, str):
                item = item.replace("(", "").replace(")", "")
                r, c = item.split(",")
                results.append((int(r.strip()), int(c.strip())))

    elif isinstance(parsed, tuple):
        r, c = parsed
        results.append((int(r), int(c)))

    return results


In [3]:
def load_model_and_tokenizer(
    model_path,
    device,
    config_name="google/tapas-base-finetuned-wikisql-supervised",
    base_model_name="google/tapas-base",
):
    """Build a TAPAS question-answering model from a saved state dict, plus its tokenizer.

    The model is instantiated from ``base_model_name`` using the configuration
    fetched from ``config_name``; its parameters are then replaced by the state
    dict stored at ``model_path``. The model is moved to ``device`` and put in
    evaluation mode.

    Args:
        model_path: Path to a file readable by ``torch.load`` that contains the
            model's state dict.
        device: Device used both as ``map_location`` when reading the file and
            as the target for ``model.to``.
        config_name: Checkpoint name or path passed to
            ``TapasConfig.from_pretrained``.
        base_model_name: Checkpoint name or path passed to
            ``TapasForQuestionAnswering.from_pretrained`` and to
            ``TapasTokenizer.from_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    config = TapasConfig.from_pretrained(config_name)
    model = TapasForQuestionAnswering.from_pretrained(base_model_name, config=config)

    # NOTE(review): torch.load deserialises the file (pickle-based unless the
    # installed torch restricts it to weights only), so model_path must come
    # from a trusted source.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    tokenizer = TapasTokenizer.from_pretrained(base_model_name)

    return model, tokenizer


In [4]:
def evaluate(model, tokenizer, dataset, device):
    """Measure cell-selection, aggregation and joint accuracy, one example at a time.

    For every row of ``dataset`` the table CSV named in ``table_file`` is read,
    tokenized together with ``question``, run through ``model``, and the logits
    are decoded with ``tokenizer.convert_logits_to_predictions``. Predictions
    are compared with the ``answer_coordinates`` and ``aggregation_label``
    columns.

    Args:
        model: Model called as ``model(**inputs)``; its output must expose
            ``logits`` and ``logits_aggregation``.
        tokenizer: Tokenizer called with ``table``/``queries`` and providing
            ``convert_logits_to_predictions``.
        dataset: ``pandas.DataFrame`` with the columns ``table_file``,
            ``question``, ``answer_coordinates`` and ``aggregation_label``.
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        tuple[float, float, float]: ``(joint_accuracy, cell_selection_accuracy,
        aggregation_accuracy)``. All three are ``0.0`` for an empty dataset.
    """
    model.eval()

    total = len(dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return (0.0, 0.0, 0.0)

    correct = 0
    correct_cellSelection = 0
    correct_aggregationPrediction = 0

    with torch.no_grad():
        for i in range(total):
            row = dataset.iloc[i]  # build the row once instead of once per field

            table = pd.read_csv(row.table_file, encoding="utf-8").astype(str)

            # NOTE(review): no truncation argument is passed, so the tokenizer is
            # not asked to shorten large tables - confirm whether
            # truncation="drop_rows_to_fit" is wanted here.
            inputs = tokenizer(
                table=table,
                queries=row.question,
                padding="max_length",
                return_tensors="pt"
            )

            # Forward pass on the target device.
            outputs = model(**{k: v.to(device) for k, v in inputs.items()})

            # Decoding uses the tokenizer output directly, so only the logits
            # have to be brought back from the device.
            predicted_answer_coordinates, predicted_aggregation_indices = \
                tokenizer.convert_logits_to_predictions(
                    inputs,
                    outputs.logits.detach().cpu(),
                    outputs.logits_aggregation.detach().cpu()
                )

            # Batch size is 1: take the only entry of each result.
            predicted_cells = {tuple(cell) for cell in predicted_answer_coordinates[0]}
            predicted_aggregation = predicted_aggregation_indices[0]

            labeled_cells = {
                tuple(cell)
                for cell in _parse_answer_coordinates(row.answer_coordinates)
            }
            labeled_aggregation = int(row.aggregation_label)

            # A selection is an unordered group of cells, so compare as sets: the
            # listing order (or a repeated cell) in the label must not turn a
            # correct prediction into a miss.
            cells_match = predicted_cells == labeled_cells
            aggregation_match = predicted_aggregation == labeled_aggregation

            if cells_match:
                correct_cellSelection += 1

            if aggregation_match:
                correct_aggregationPrediction += 1

            if cells_match and aggregation_match:
                correct += 1

    return (
        correct / total,
        correct_cellSelection / total,
        correct_aggregationPrediction / total
    )


In [5]:

# CSV listing the evaluation questions; each row names the table file it refers to.
test_excel_csv = "val_data_all.csv"
test_dataset = pd.read_csv(filepath_or_buffer=test_excel_csv)

print(f"Test samples: {test_dataset.shape[0]}")
test_dataset.head(5)


Test samples: 348


Unnamed: 0,id,annotator,gpt测试结果,答案,Unnamed: 4,position,question,table_file,answer_coordinates,answer_text,aggregation_label,float_answer,Unnamed: 9,澶囨敞aggregation_label涓細\n0锛歂ONE\n1锛歋UM\n2锛欰VERAGE\n3: COUNT,label
0,,,1.0,,,,What is the elevation of Ground in meters abov...,52Theparade(rvt2017)_floor.csv,"['(1, 2)']",['-46.5370539649164'],0,-46.537054,,,floor
1,,,0.0,167.0,产生幻觉,,"How many windows are in the building, do you k...",161210Med_Dent_Clinic_Combined_floor.csv,"['(8, 7)', '(9, 7)', '(13, 7)']",['94.0'],1,94.0,,,floor
2,,,1.0,,,,What is the number of stairs on EG?,20170601_Mauer_BmB_floor.csv,"['(2, 4)']",['1.0'],0,1.0,,,floor
3,,,1.0,,,,How many stairs are there in 4th Floor?,OfficeBuilding_floor.csv,"['(2, 5)']",['5.0'],0,5.0,,,floor
4,,,0.0,2.965,,,How high above the ground is Level 7 in relati...,Learningzonecorentin_floor.csv,"['(8, 2)']",['6.823'],0,6.823,,,floor


In [6]:
# File holding the saved model state dict.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
MODEL_PATH = r"C:\state_dict_model.pt"

model, tokenizer = load_model_and_tokenizer(model_path=MODEL_PATH, device=device)


Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['output_weights', 'aggregation_classifier.bias', 'column_output_bias', 'output_bias', 'aggregation_classifier.weight', 'column_output_weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Score the loaded model on the evaluation set and report the three accuracies.
acc, acc_cell, acc_agg = evaluate(model, tokenizer, test_dataset, device)

print(
    f"Joint Accuracy: {acc:.4f}",
    f"Cell Selection Accuracy: {acc_cell:.4f}",
    f"Aggregation Accuracy: {acc_agg:.4f}",
    sep="\n",
)


Token indices sequence length is longer than the specified maximum sequence length for this model (1237 > 512). Running this sequence through the model will result in indexing errors


Joint Accuracy: 0.8649
Cell Selection Accuracy: 0.9339
Aggregation Accuracy: 0.9282
