In [1]:
# Stop marker placed on the last line of every sample by build_prompt;
# verify_stop_tokens checks that each written record ends with it.
END_TOKEN = "<END_ANSWER>"

In [2]:
def build_prompt(table, question, answer, reasoning):
    """Assemble one canonical prompt/answer sample as a single string.

    The sample has a fixed layout: a role line, then the "Table:",
    "Question:", "Answer:" and "Reasoning:" sections in that order, and
    finally END_TOKEN on its own last line (no trailing newline).

    Args:
        table: Table content inserted under "Table:".
        question: Question inserted under "Question:".
        answer: Answer inserted under "Answer:".
        reasoning: Explanation inserted under "Reasoning:".

    Returns:
        The formatted sample string.
    """
    layout = [
        "You are a financial analyst.",
        "",
        "Table:",
        f"{table}",
        "",
        "Question:",
        f"{question}",
        "",
        "Answer:",
        f"{answer}",
        # No blank line between the answer and the reasoning section.
        "Reasoning:",
        f"{reasoning}",
        f"{END_TOKEN}",
    ]
    return "\n".join(layout)

In [3]:
def grab(text, tag):
    """Extract the section of ``text`` that follows ``tag``.

    The section starts right after the first occurrence of ``tag`` and ends
    at the next occurrence of ``tag`` itself or of any other known section
    marker ("[TABLE]", "[QUESTION]", "[ANSWER]"), whichever comes first.

    Args:
        text: Raw string containing tagged sections.
        tag: The marker whose section should be returned.

    Returns:
        The section content with surrounding whitespace removed, or "" when
        ``tag`` does not occur in ``text``.
    """
    if tag not in text:
        return ""

    # Element 1 of the split is the text between the first and second
    # occurrence of the tag (or up to the end if it occurs only once).
    section = text.split(tag)[1]

    other_markers = [m for m in ("[TABLE]", "[QUESTION]", "[ANSWER]") if m != tag]
    for marker in other_markers:
        # Splitting on an absent marker yields the whole string unchanged.
        section = section.split(marker)[0]

    return section.strip()

In [4]:
import json

def process_file(input_path, output_path):
    """Convert a tagged dataset file into canonical prompt samples (JSONL).

    The input may be a JSON array or JSON Lines; the format is chosen from the
    first non-whitespace character of the file. Each row's "text" field is
    expected to hold "[TABLE]", "[QUESTION]" and "[ANSWER]" sections. Rows
    where any of the three sections is missing or empty are skipped. The
    answer section is split on its first "|" into answer and reasoning; when
    there is no "|", a fixed placeholder reasoning is used.

    Args:
        input_path: Path of the UTF-8 JSON / JSONL file to read.
        output_path: Path of the JSONL file to write, one {"text": ...}
            object per line.

    Returns:
        None. Prints a one-line summary with the number of samples written.

    Raises:
        json.JSONDecodeError: If the file (or a non-blank JSONL line) is not
            valid JSON.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        # Peek at the first non-whitespace character so a JSON array that is
        # preceded by blank lines or indentation is still recognised.
        first = f.read(1)
        while first and first.isspace():
            first = f.read(1)
        f.seek(0)

        if first == "[":
            # JSON array
            rows = json.load(f)
        else:
            # JSONL: ignore blank lines (e.g. a trailing newline at EOF),
            # which json.loads would otherwise reject.
            rows = [json.loads(line) for line in f if line.strip()]

    final = []

    for r in rows:
        # A missing key or an explicit null both count as "no text".
        raw = r.get("text") or ""

        table = grab(raw, "[TABLE]")
        question = grab(raw, "[QUESTION]")
        ans_block = grab(raw, "[ANSWER]")

        if not table or not question or not ans_block:
            continue

        # split answer | reasoning (only on the first "|")
        if "|" in ans_block:
            answer, reasoning = ans_block.split("|", 1)
        else:
            answer = ans_block
            reasoning = "Answer derived directly from the table."

        answer = answer.strip()
        reasoning = reasoning.strip()

        # build canonical sample
        sample = build_prompt(
            table=table.strip(),
            question=question.strip(),
            answer=answer,
            reasoning=reasoning,
        )

        # Defensive check: append the stop token if the built sample does not
        # already end with it.
        if not sample.strip().endswith(END_TOKEN):
            sample = sample.rstrip() + "\n" + END_TOKEN

        final.append({"text": sample})

    with open(output_path, "w", encoding="utf-8") as f:
        for r in final:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"{input_path}  →  {output_path}   ({len(final)} samples)")

In [5]:
# (input file, output file) pairs, converted in the order listed.
conversion_jobs = [
    # FinQA
    ("finqa_processed_train.json", "finqa_train_final.jsonl"),
    ("finqa_processed_val.json", "finqa_val_final.jsonl"),
    ("finqa_processed_test.json", "finqa_test_final.jsonl"),
    # FinanceBench
    ("financebench_train_aug.jsonl", "financebench_train_final.jsonl"),
    ("financebench_ood_test.jsonl", "financebench_ood_final.jsonl"),
    # TAT-QA
    ("tatqa_processed_dev.jsonl", "tatqa_dev_final.jsonl"),
    ("tatqa_processed_test_gold.jsonl", "tatqa_test_final.jsonl"),
]

for source_path, target_path in conversion_jobs:
    process_file(source_path, target_path)


finqa_processed_train.json  →  finqa_train_final.jsonl   (1147 samples)
finqa_processed_val.json  →  finqa_val_final.jsonl   (6251 samples)
finqa_processed_test.json  →  finqa_test_final.jsonl   (883 samples)
financebench_train_aug.jsonl  →  financebench_train_final.jsonl   (105 samples)
financebench_ood_test.jsonl  →  financebench_ood_final.jsonl   (45 samples)
tatqa_processed_dev.jsonl  →  tatqa_dev_final.jsonl   (1639 samples)
tatqa_processed_test_gold.jsonl  →  tatqa_test_final.jsonl   (1651 samples)


In [6]:
def verify_stop_tokens(path):
    """Report how many records in a JSONL file end with END_TOKEN.

    Every line of ``path`` is parsed as JSON and its "text" value is checked,
    after stripping surrounding whitespace, for an END_TOKEN suffix. A summary
    of the form "<path>: <ok>/<total> have STOP TOKEN" is printed.

    Args:
        path: Path of the UTF-8 JSONL file to inspect.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # One boolean per record: True when the text ends with the stop token.
        ends_ok = [
            json.loads(record)["text"].strip().endswith(END_TOKEN)
            for record in handle
        ]

    n_records = len(ends_ok)
    n_good = sum(ends_ok)
    print(f"{path}: {n_good}/{n_records} have STOP TOKEN")

# Check every generated file, in the same order they were written.
final_files = [
    "finqa_train_final.jsonl",
    "finqa_val_final.jsonl",
    "finqa_test_final.jsonl",
    "financebench_train_final.jsonl",
    "financebench_ood_final.jsonl",
    "tatqa_dev_final.jsonl",
    "tatqa_test_final.jsonl",
]

for final_path in final_files:
    verify_stop_tokens(final_path)


finqa_train_final.jsonl: 1147/1147 have STOP TOKEN
finqa_val_final.jsonl: 6251/6251 have STOP TOKEN
finqa_test_final.jsonl: 883/883 have STOP TOKEN
financebench_train_final.jsonl: 105/105 have STOP TOKEN
financebench_ood_final.jsonl: 45/45 have STOP TOKEN
tatqa_dev_final.jsonl: 1639/1639 have STOP TOKEN
tatqa_test_final.jsonl: 1651/1651 have STOP TOKEN


In [7]:
def preview_dataset(path, n=3):
    """Print the first ``n`` samples of a JSONL dataset for visual inspection.

    A banner with the file path is printed first; each sample's "text" value
    is then printed followed by a dashed separator line. If the file holds
    fewer than ``n`` records, all of them are shown and the function returns
    normally.

    Args:
        path: Path of the UTF-8 JSONL file to preview.
        n: Maximum number of records to print (default 3).
    """
    from itertools import islice

    print("=" * 80)
    print("PREVIEW:", path)
    print("=" * 80)
    with open(path, "r", encoding="utf-8") as f:
        # islice stops cleanly at end of file, whereas calling next(f) a fixed
        # number of times raises StopIteration on short files.
        for line in islice(f, n):
            print(json.loads(line)["text"])
            print("-" * 80)

# Show the first two samples from one file of each dataset family.
for preview_path in (
    "finqa_train_final.jsonl",
    "financebench_train_final.jsonl",
    "tatqa_test_final.jsonl",
):
    preview_dataset(preview_path, 2)

PREVIEW: finqa_train_final.jsonl
You are a financial analyst.

Table:
| amount ( in millions )
2014 net revenue | $ 5735
retail electric price | 187
volume/weather | 95
waterford 3 replacement steam generator provision | -32 ( 32 )
miso deferral | -35 ( 35 )
louisiana business combination customer credits | -107 ( 107 )
other | -14 ( 14 )
2015 net revenue | $ 5829

Question:
what is the net change in net revenue during 2015 for entergy corporation?

Answer:
94
Reasoning:
Answer derived directly from the table.
<END_ANSWER>
--------------------------------------------------------------------------------
You are a financial analyst.

Table:
( square feet in millions ) | unitedstates | othercountries | total
owned facilities1 | 30.7 | 17.2 | 47.9
leased facilities2 | 2.1 | 6.0 | 8.1
total facilities | 32.8 | 23.2 | 56.0

Question:
what percentage of total facilities as measured in square feet are leased?

Answer:
14%
Reasoning:
Answer derived directly from the table.
<END_ANSWER>
--------