In [None]:
#| hide

%load_ext autoreload
%autoreload 2

# Transformation of Bird Benchmark

> Here, we transform the benchmark into the Fact-Checking benchmark

In [None]:
#| default_exp transformation

In [None]:
#| export 
#| hide

import json
from claimdb.configuration import *
from claimdb.preprocess_bird import *
from pydantic import BaseModel, Field
from openai import OpenAI

In [None]:
with open(config.bird_dir / 'train_dev_filtered.jsonl', 'r') as f:
    bird_data = [json.loads(line) for line in f]

In [None]:
#| notest

client = OpenAI()

In [None]:
len(bird_data)

6854

## Contradicted Claims

### Inputs & Outputs

Here, we will define what the LLMs will receive as input and what they should output.

In [None]:
#| export
#| hide

# Schema for one generated claim that is incompatible with the correct answer.
class ContradictedClaim(BaseModel):
    # Text of the claim; required (`...` means the field has no default).
    contradicted_claim: str = Field(
        ...,
        description=(
            "A contradicted claim."
        )
    )

# Wrapper model holding the list of contradicted claims produced for one input;
# this is the type handed to the API as the expected structured output.
class ContradictedClaimCollection(BaseModel):
    collection: list[ContradictedClaim]

In [None]:
print(ContradictedClaim.model_fields)

{'contradicted_claim': FieldInfo(annotation=str, required=True, description='A contradicted claim.')}


In [None]:
print(ContradictedClaimCollection.model_fields)

{'collection': FieldInfo(annotation=list[ContradictedClaim], required=True)}


### Prompt

In [None]:
#| exports

# Developer prompt used to generate CONTRADICTED claims: claims that a reader who
# knows the correct answer would judge to be false. Exported like the other two
# prompt constants so the module exposes all three.
CONTRADICTED_INSTR = """
## Role
You are a **misleading** spokesperson  **in a controlled evaluation setting**.

## Task
Given the following inputs:
- A question
- Its correct answer
- The data domain
- Optional external knowledge (clarifications)

Your task is to produce natural language claims that are factually incompatible with the provided answer. In other words, any reader who knows the correct answer would judge your claim to be false.

## Requirements
- Each claim must be self-contained and must not use opaque references to earlier context (e.g., "the answer," "the question," "the earlier claim", etc.). Instead, any needed context should be stated explicitly within each claim.
- Each claim must contradict or be factually incompatible with the answer, directly or indirectly.
- Do not restate or explain the external knowledge; assume it is already known to the reader.
- Produce between 1 and 3 claims.
## Example

### Input
{
  "question": "Which three districts recorded the highest graduation rates in 2022?",
  "answer": [
    {
      "DistrictName": "Redwood Coast Unified",
      "GradRate": 0.97
    },
    {
      "DistrictName": "Sierra Vista Union",
      "GradRate": 0.96
    },
    {
      "DistrictName": "Mission Creek Unified",
      "GradRate": 0.95
    }
  ],
  "domain": "California Schools",
  "external-knowledge": "GradRate = Number of graduates / Total number of eligible seniors"
}

### Output
Redwood Coast Unified did not lead California's graduation rankings in 2022 — it was Riverbend Joint Unified that posted the top rate.

Sierra Vista Union is no longer among the highest graduation-rate districts in 2022.

Fairmont Hills Unified surpassed Redwood Coast Unified with 98% of its eligible seniors graduating in 2022, according to data in CA.
"""

#conservatively restrict claims to entities already present in the answer or clarifications.

### Example

In [None]:
inp = format_for_llm(prepare_bird_example(bird_data[11]))

for k, v in json.loads(inp).items():
    print(f"{k}: {v}")

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 


In [None]:
#| notest

response = client.responses.parse(
    model="gpt-5",
    input=[
        {"role": "developer", "content": CONTRADICTED_INSTR},
        {"role": "user", "content": inp}
    ],
    text_format=ContradictedClaimCollection,   # ← structured output here
)

In [None]:
#| notest

for k, v in json.loads(inp).items():
    print(f"{k}: {v}")

print()

print("Generated Contradicted Claims:")
for i, item in enumerate(response.output_parsed.collection):
    print(f"{i+1}. {item.contradicted_claim}")
    print()

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 

Generated Contradicted Claims:
1. The active district with the highest average Reading score is Cupertino Union, not Palo Alto Unified.

2. San Ramon Valley Unified leads all active districts in average Reading performance.

3. Palo Alto Unified is not the top performer in Reading; Los Gatos–Saratoga Union High School District holds the highest average score among active districts.



## Entailed Claims

### Inputs & Outputs

Here, we will define what the LLMs will receive as input and what they should output.

In [None]:
#| export
#| hide

# Schema for one generated claim that is supported by the correct answer.
class EntailedClaim(BaseModel):
    # Text of the claim; required (`...` means the field has no default).
    entailed_claim: str = Field(
        ...,
        description=(
            "An entailed claim."
        )
    )

# Wrapper model holding the list of entailed claims produced for one input;
# this is the type handed to the API as the expected structured output.
class EntailedClaimCollection(BaseModel):
    collection: list[EntailedClaim]

In [None]:
print(EntailedClaim.model_fields)

{'entailed_claim': FieldInfo(annotation=str, required=True, description='An entailed claim.')}


In [None]:
print(EntailedClaimCollection.model_fields)

{'collection': FieldInfo(annotation=list[EntailedClaim], required=True)}


### Prompt

In [None]:
#| exports

# Developer prompt used to generate ENTAILED claims: claims that a reader who
# knows the correct answer would judge to be true. It is sent as the `developer`
# message; the formatted example is sent as the `user` message.
ENTAILED_INSTR = """
## Role
You are an **honest** spokesperson **in a controlled evaluation setting**.

## Task
Given the following inputs:
- A question
- Its correct answer
- The data domain
- Optional external knowledge (clarifications)

Your task is to produce natural language claims that are consistent with and supported by the provided answer. In other words, any reader who knows the correct answer would judge your claim to be true.

## Requirements
- Each claim must be self-contained and must not use opaque references to earlier context (e.g., "the answer," "the question," "the earlier claim", etc.). Instead, any needed context should be stated explicitly within each claim.
- Each claim must follow from or be fully supported by the answer, directly or indirectly.
- Do not restate or explain the external knowledge; assume it is already known to the reader.
- Produce between 1 and 3 claims.

## Example

### Input
{
  "question": "Which three districts recorded the highest graduation rates in 2022?",
  "answer": [
    {
      "DistrictName": "Redwood Coast Unified",
      "GradRate": 0.97
    },
    {
      "DistrictName": "Sierra Vista Union",
      "GradRate": 0.96
    },
    {
      "DistrictName": "Mission Creek Unified",
      "GradRate": 0.95
    }
  ],
  "domain": "California Schools",
  "external-knowledge": "GradRate = Number of graduates / Total number of eligible seniors"
}

### Output
Redwood Coast Unified led California's graduation rankings in 2022 with a 97% rate.

In 2022, California's strongest graduation results came from Redwood Coast Unified, which saw 97% of its eligible seniors finish high school. Sierra Vista Union and Mission Creek Unified followed closely, with graduation rates of 96% and 95%, respectively.

Mission Creek Unified achieved a graduation rate of 95% in 2022, placing it among California's top three districts. It ranked just behind Redwood Coast Unified and Sierra Vista Union. The rate indicates the percentage of eligible seniors who graduated.
"""

#- Additional claims must differ in factual content, not just wording. Avoid simple paraphrases.

### Example

In [None]:
inp = format_for_llm(prepare_bird_example(bird_data[11]))

for k, v in json.loads(inp).items():
    print(f"{k}: {v}")

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 


In [None]:
#| notest

response = client.responses.parse(
    model="gpt-5",
    input=[
        {"role": "developer", "content": ENTAILED_INSTR},
        {"role": "user", "content": inp}
    ],
    text_format=EntailedClaimCollection,   # ← structured output here
)

In [None]:
#| notest

for k, v in json.loads(inp).items():
    print(f"{k}: {v}")

print()

print("Generated Entailed Claims:")
for i, item in enumerate(response.output_parsed.collection):
    print(f"{i+1}. {item.entailed_claim}")
    print()

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 

Generated Entailed Claims:
1. Among active California school districts, Palo Alto Unified has the highest average Reading score.

2. Palo Alto Unified leads all active districts in average Reading performance.



## Not Enough Info / Abstain Claims

We define abstention as a **refusal to answer definitively** (see [FAIR at Meta](https://arxiv.org/pdf/2506.09038)). In our dataset we will create three types of abstention:

1. **Out-of-Schema Facts**: If a concept has no representation in the schema, then any claim on that concept is unanswerable. No SQL query can decide it.
2. **Subjective / Evaluative claims**: If a claim is subjective or evaluative in nature, it cannot be answered definitively by the database (e.g., “disappointing”, “impressive”, “underwhelming”, “controversial”, “widely criticized” etc.)
3. **Counterfactuals / Hypothetical Claims**: If a claim is counterfactual or hypothetical in nature, it cannot be answered definitively by the database (e.g., “If X had happened, would Y be true?”).

As you have already guessed, we will also provide the analytical schema to the LLMs so that they can identify out-of-schema facts.

### Inputs & Outputs

Here, we will define what the LLMs will receive as input and what they should output.

In [None]:
#| export
#| hide
from typing import Literal

In [None]:
#| export
#| hide

# Schema for one generated claim whose truth cannot be decided (NOT ENOUGH INFO).
class NoInfoClaim(BaseModel):
    # Text of the claim; required.
    no_info_claim: str = Field(
        ...,
        description="A NOT ENOUGH INFO claim."
    )
    # Which of the three abstention types the claim belongs to; restricted to
    # exactly these three spellings by the Literal annotation.
    category: Literal["Out-of-Schema", "Subjective", "Counterfactual"] = Field(
        ...,
        description="The category of the NOT ENOUGH INFO claim."
    )

# Wrapper model holding the list of NOT ENOUGH INFO claims produced for one input;
# this is the type handed to the API as the expected structured output.
class NoInfoClaimCollection(BaseModel):
    collection: list[NoInfoClaim]

### Prompt

In [None]:
#| exports

# Developer prompt used to generate NOT ENOUGH INFO claims. Unlike the other two
# prompts it expects the database schema among the inputs and contains no worked
# example. The text is kept exactly as used for generation.
NO_INFO_INSTR = """
## Role
You are a neutral spokespearson **in a controlled evaluation setting**.

## Task
Given the following inputs:
- A question
- Its correct answer
- The data domain
- The schema of the database
- Optional external knowledge (clarifications)

Your task is to produce natural language claims whose truth **cannot** be determined from the database or the given Q/A. That is, even with full access to both the database and the correct answer, these claims cannot be definitively verified or falsified.

## Requirements
- Each claim must be self-contained and must not use opaque references to earlier context (e.g., "the answer," "the question," "the earlier claim", etc.). Instead, any needed context should be stated explicitly within each claim.
- Each claim must *not* be entailed or contradicted by the answer, directly or indirectly.
- Each claim must fall into at least one of these categories:
  1. **Out-of-schema** — involves concepts the database doesn't store or represent anywhere in its schema.
  2. **Subjective/evaluative** — expresses opinions or judgments that cannot be objectively verified.
  3. **Counterfactual/hypothetical** — describes an imagined or "what if" situation that is not reflected in the actual data.
- Produce between 1 and 5 claims.
- Do not restate or explain the external knowledge; assume it is already known to the reader.
"""

# - Each claim must be self-contained. For example, they must not contain meta-information like "the answer", "the question", etc.


### Example

In [None]:
inp = format_for_llm(prepare_bird_example(bird_data[11], with_schema=True))

for k, v in json.loads(inp).items():
    if k != 'db-schema': 
        print(f"{k}: {v}")
    else:
        print(f"{k}:")
        print(v)

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 
db-schema:
Table: schools
Columns:
  - CDSCode
  - National Center for Educational Statistics school district identification number
  - National Center for Educational Statistics school identification number
  - StatusType
  - County
  - District
  - School
  - Street
  - street address 
  - City
  - Zip
  - State
  - MailStreet
  - mailing street address 
  - mailing city
  - mailing zip 
  - mailing state
  - Phone
  - extension
  - Website
  - OpenDate
  - ClosedDate
  - Charter
  - CharterNum
  - FundingType
  - District Ownership Code
  - The District Ownership Code Type
  - School Ownership Code
  - School Ownership Code Type
  - Education Option Code
  - Educational Option Name
  - Educational Instruction Level Code
  - Educational Instruction Level Name 
  - grade span offered
  - grade span served.
  - Virtual
  -

In [None]:
#| notest

response = client.responses.parse(
    model="gpt-5",
    input=[
        {"role": "developer", "content": NO_INFO_INSTR},
        {"role": "user", "content": inp}
    ],
    text_format=NoInfoClaimCollection,   # ← structured output here
)

In [None]:
#| notest

for k, v in json.loads(inp).items():
    if k != 'db-schema':
        print(f"{k}: {v}")

print()

print("Not-Enough-Info Claims:")
for i, item in enumerate(response.output_parsed.collection):
    print(f"{i+1}. [{item.category}] {item.no_info_claim}")
    print()

question: Which active district has the highest average score in Reading?
answer: [{'District': 'Palo Alto Unified'}]
domain: California Schools
external-knowledge: 

Not-Enough-Info Claims:
1. [Out-of-Schema] Palo Alto Unified's average household income is above $200,000.

2. [Subjective] Palo Alto Unified's reading curriculum is widely regarded as the most innovative in California.

3. [Counterfactual] If the SAT Reading section had used an adaptive, computer-based format instead of a fixed paper test, Palo Alto Unified would not have led the state in average Reading scores.

4. [Out-of-Schema] A majority of students in Palo Alto Unified participate in after-school robotics programs.



## Transform BIRD benchmark

In [None]:
#| export
from openai.lib._parsing._responses import type_to_text_format_param, parse_response

### Create Payload Function

See [here](https://community.openai.com/t/structured-outputs-with-batch-processing/911076/16) for more information and also [here](https://community.openai.com/t/responses-api-documentation-on-structured-outputs-is-lacking/1356632)

We will need to use the `create` API. Thus, we have to specify the output format ourselves (reminder: the `.parse` API we used earlier did this for us through its `text_format` argument).

Now that we have understood how to:
1. specify the output format (i.e., `text = {"format": ...}`)
2. parse the response (i.e., `parse_response(...)`)

we can create a function that will create the payload for each input in our benchmark.

In [None]:
#| export
from typing import Any

In [None]:
#| export

def construct_payload(request_id: str, # unique (for this batch) request id
                      model: str, # model name
                      instr: str, # the prompt w/ instructions
                      inp: str, # the input
                      format_type: Any # the expected output format type (e.g., NoInfoClaimCollection)
                      ) -> dict: # returns the payload dict
    """ Build one Batch API request entry that targets the Responses endpoint. """

    # Instructions go in the developer message, the formatted example in the user message.
    messages = [
        {"role": "developer", "content": instr},
        {"role": "user", "content": inp}
    ]

    # Structured-output specification derived from the expected output type.
    text_spec = {"format": type_to_text_format_param(format_type)}

    return {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/responses",
        "body": {
            "model": model,
            "input": messages,
            "text": text_spec
        }
    }

### Create all BIRD payloads

There are three payloads for each unique `bird_id`:
1. The payload about ENTAILED claims
2. The payload about CONTRADICTED claims
3. The payload about NOT ENOUGH INFO claims

Their difference is the prompt of course (+ the return type)!

In [None]:
with open(config.bird_dir / 'train_dev_filtered.jsonl', 'r') as f:
    bird_data = [json.loads(line) for line in f]

In [None]:
len(bird_data)

6854

In [None]:
bird_data[0].keys()

dict_keys(['question_id', 'db_id', 'question', 'evidence', 'SQL', 'difficulty', 'split', 'bird_id', 'result', 'db-schema'])

In [None]:
bird_data[0]['bird_id']

0

Now we can create all payloads

In [None]:
import tqdm

As the `custom_id` in OpenAI's payload format (i.e., `request_id`) we use the `bird_id`, the split (`train`/`dev`) and the claim type (`Entailed`/`Contradicted`/`NoInfo`), joined with `-`, so we can identify each request later on (when the batch returns).

In [None]:
#| notest

payloads = []

for example in tqdm.tqdm(bird_data):
    regular_inp = format_for_llm(prepare_bird_example(example))
    noinfo_inp = format_for_llm(prepare_bird_example(example, with_schema=True))

    bird_id = example['bird_id']
    split = example['split']

    # One row per claim type: (id suffix, instructions, input, output schema).
    # Row order fixes the order of the three payloads for each example.
    variants = (
        ("Entailed", ENTAILED_INSTR, regular_inp, EntailedClaimCollection),
        ("Contradicted", CONTRADICTED_INSTR, regular_inp, ContradictedClaimCollection),
        ("NoInfo", NO_INFO_INSTR, noinfo_inp, NoInfoClaimCollection),
    )

    for claim_type, claim_instr, claim_inp, claim_schema in variants:
        payloads.append(
            construct_payload(
                request_id=f"{bird_id}-{split}-{claim_type}",
                model="gpt-5",
                instr=claim_instr,
                inp=claim_inp,
                format_type=claim_schema
            )
        )

100%|██████████| 6854/6854 [00:02<00:00, 2538.66it/s]


In [None]:
#| notest
payloads[4]['custom_id']

'1-dev-Contradicted'

In [None]:
#| notest
with open(config.output_data_dir / 'openai_benchmark_payloads.jsonl', 'w') as f:
    for payload in payloads:
        f.write(json.dumps(payload) + '\n')

### Select Payloads to Send

In [None]:
with open(config.output_data_dir / 'openai_benchmark_payloads.jsonl', 'r') as f:
    payloads = [json.loads(line) for line in f]

In [None]:
payloads[0]['custom_id']

'0-dev-Entailed'

In [None]:
def select_payloads(num_examples: int) -> list[dict]:
    """ Select payloads to send that we have not tested on.

    Reads the raw results file (when it exists) to collect the `(bird_id, label)`
    pairs that already have a response, then walks the payload file in order and
    returns at most `num_examples` payloads whose pair is not in that set.

    `num_examples` may be `float('inf')` to select everything that is left.
    A value of zero or less selects nothing.
    """

    # Without this guard the limit was only checked after the first append,
    # so asking for 0 payloads still returned one.
    if num_examples <= 0:
        return []

    already_tested = set()

    tests_done_path = config.output_data_dir / 'openai_raw_results.txt'
    if tests_done_path.exists():

        with open(tests_done_path, "r") as f:
            # Iterate the file instead of `read().splitlines()`: splitlines() also breaks
            # on characters such as U+2028 that may appear unescaped inside a JSON string,
            # which would cut a record in half.
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines between appended batches
                record = json.loads(line)
                bird_id, _, label = record['custom_id'].split('-')
                already_tested.add((bird_id, label))

    selected = []
    with open(config.output_data_dir / 'openai_benchmark_payloads.jsonl', 'r') as f:
        for line in f:
            if not line.strip():
                continue

            payload = json.loads(line)

            bird_id, _, label = payload['custom_id'].split('-')
            if (bird_id, label) in already_tested:
                continue

            selected.append(payload)

            if len(selected) >= num_examples:
                break

    return selected

In [None]:
for payload in select_payloads(10):
    print(payload['custom_id'])

### Submit Batches

Here, we will use OpenAI's batch API. 

First, load the payloads and create a file with them.

In [None]:
len(select_payloads(float('inf')))

0

In [None]:
payloads = select_payloads(15658)

In [None]:
tmp_batch = config.output_data_dir / 'tmp_openai_batch_payloads.jsonl'

In [None]:
#| notest

with open(tmp_batch, 'w') as f:
    for payload in payloads:
        f.write(json.dumps(payload) + '\n')

Now, create a batch on OpenAI

In [None]:
#| notest

# Open the payload file in a context manager so the handle is closed once the
# upload call returns (the bare `open(...)` argument was never closed).
with open(tmp_batch, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='file-GrNmFfFScZgSiJr1PR77mj', bytes=57606642, created_at=1765596985, filename='tmp_openai_batch_payloads.jsonl', object='file', purpose='batch', status='processed', expires_at=1768188985, status_details=None)


In [None]:
#| notest

batch_input_file_id = batch_input_file.id

batch_input_file_id

'file-GrNmFfFScZgSiJr1PR77mj'

Once we've successfully uploaded our input file, we can use the input File object's ID to create a batch.

In [None]:
description = "ALL BIRD"

In [None]:
#| notest

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/responses",
    completion_window="24h",
    metadata={
        "description": description
    }
)

We have to find the `batch.id` and save it for later use.

In [None]:
#| notest

batch_id = batch.id

batch_id

'batch_693cdf5698788190a0ed7b548a2fd7be'

Using the `batch.id` we can now retrieve the batch.

In [None]:
#| notest

batch = client.batches.retrieve(batch_id)

batch.status

'completed'

### Download Batch Results and Save Raw

When the batch is completed, we need to find the `output_file_id` and download the results.

In [None]:
#| notest

output_file_id = batch.output_file_id

output_file_id

'file-BL9zua8D4DJgFsRfnjK6cV'

In [None]:
#| notest

file_response = client.files.content(output_file_id)
raw_results = file_response.text

# Results of successive batches are appended to one file that is later read line
# by line. Make sure this chunk ends with a newline so its last record cannot be
# fused with the first record of the next appended batch.
with open(config.output_data_dir / 'openai_raw_results.txt', "a") as f:
    f.write(raw_results)
    if raw_results and not raw_results.endswith('\n'):
        f.write('\n')

### Parse and Save

#### Example of how to parse

We will start with an example of parsing a single claim response.

In [None]:
with open(config.output_data_dir / 'openai_raw_results.txt', "r") as f:
    data = f.read()

#line = file_response.read().splitlines()[-1]
line = data.splitlines()[0]

record = json.loads(line)

record['custom_id']

'0-dev-Entailed'

In [None]:
type(record)

dict

In [None]:
record['custom_id']

'0-dev-Entailed'

In [None]:
record['custom_id'].split('-')

['0', 'dev', 'Entailed']

In [None]:
bird_id, _, label = record['custom_id'].split('-')

In [None]:
# Pick the structured-output model that matches the claim type in the custom id.
if label == "Entailed":
    output_format = EntailedClaimCollection
elif label == "Contradicted":
    output_format = ContradictedClaimCollection
elif label == "NoInfo":
    output_format = NoInfoClaimCollection

In [None]:
output_format

__main__.EntailedClaimCollection

In [None]:
from openai.types.responses import Response

In [None]:
parsed_rec = parse_response(
    response=Response.model_validate(record['response']['body']),
    text_format=output_format,
    input_tools=[]
)

In [None]:
parsed_rec.output_parsed.collection

[EntailedClaim(entailed_claim='The highest eligible free rate for K-12 students among Alameda County schools is 1.0, or 100%.'),
 EntailedClaim(entailed_claim='At least one Alameda County K-12 school has a Free Meal Count equal to its K-12 Enrollment, producing an eligible free rate of 1.0.')]

In [None]:
print(f"Generated {label} Claims:")
for i, item in enumerate(parsed_rec.output_parsed.collection):
    print(f"{i+1}. {item}")
    print()

Generated Entailed Claims:
1. entailed_claim='The highest eligible free rate for K-12 students among Alameda County schools is 1.0, or 100%.'

2. entailed_claim='At least one Alameda County K-12 school has a Free Meal Count equal to its K-12 Enrollment, producing an eligible free rate of 1.0.'



#### Parser Function

In [None]:
#| export

from openai.types.responses import Response

In [None]:
#| export

def claim_collection_json_to_parsed(claim_collection: dict  # A specific claim collection (e.g., all "ENTAILED" claims for the specific BIRD Q/A)
                         ):
    """ Parses a claim collection to the OpenAI format with the matched classes.
    Each such claim collection is about a specific BIRD Q/A and a specific label (e.g., ENTAILED).
    Returns:
        bird_id: The BIRD Q/A ID.
        label: The label of the claim collection (all claims in the collection are this).
        claim_collection_parsed: The parsed claim collection.
    Raises:
        ValueError: If the `custom_id` does not have three `-`-separated parts, or if
            its last part is not one of `Entailed`, `Contradicted`, `NoInfo`.
    """
    custom_id = claim_collection['custom_id']
    bird_id, _, label = custom_id.split("-")

    bird_id = int(bird_id)

    # Short label used in the custom id -> label stored in the benchmark.
    canonical_labels = {
        "Entailed": "ENTAILED",
        "Contradicted": "CONTRADICTED",
        "NoInfo": "NOT ENOUGH INFO",
    }
    if label not in canonical_labels:
        # Previously an unknown label left `output_format` unbound and failed later
        # with an UnboundLocalError that did not name the offending record.
        raise ValueError(
            f"Unknown claim label {label!r} in custom_id {custom_id!r}; "
            f"expected one of {sorted(canonical_labels)}"
        )

    # Short label -> pydantic model the response text must be parsed into.
    output_formats = {
        "Entailed": EntailedClaimCollection,
        "Contradicted": ContradictedClaimCollection,
        "NoInfo": NoInfoClaimCollection,
    }
    output_format = output_formats[label]
    label = canonical_labels[label]

    claim_collection_parsed = parse_response(
        response=Response.model_validate(claim_collection['response']['body']),
        text_format=output_format,
        input_tools=[]
    )

    return bird_id, label, claim_collection_parsed

In [None]:
bird_id, label, parsed = claim_collection_json_to_parsed(record)

In [None]:
bird_id

0

In [None]:
label

'ENTAILED'

In [None]:
parsed.output_parsed

EntailedClaimCollection(collection=[EntailedClaim(entailed_claim='The highest eligible free rate for K-12 students among Alameda County schools is 1.0, or 100%.'), EntailedClaim(entailed_claim='At least one Alameda County K-12 school has a Free Meal Count equal to its K-12 Enrollment, producing an eligible free rate of 1.0.')])

In [None]:
parsed.usage

ResponseUsage(input_tokens=648, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=1109, output_tokens_details=OutputTokensDetails(reasoning_tokens=1024), total_tokens=1757)

#### Parse all and Save

**This is where we save our beautiful transformed benchmark!**

Each claim will be saved in this format:
- `claim_id`: The unique identifier of the claim.
- `bird_id`: The unique identifier of the BIRD Q/A pair that this claim is based on.
- `db_name`: The name of the BIRD database the Q/A pair belongs to.
- `claim`: The text of the claim.
- `extra_info`: The external knowledge (BIRD `evidence`) attached to the Q/A pair; may be empty.
- `label`: The ground truth label of the claim (`ENTAILED`, `CONTRADICTED`, `NOT ENOUGH INFO`).
- OPTIONAL FIELDS:
  - `category`: For `NOT ENOUGH INFO` claims, the category of abstention (`OUT-OF-SCHEMA`, `SUBJECTIVE`, `COUNTERFACTUAL`).

In [None]:
#| notest
# Iterate the file line by line rather than `f.read().splitlines()`: splitlines()
# also breaks on characters such as U+2028 that can appear unescaped inside a JSON
# string. Blank lines (e.g. between appended batches) are skipped.
with open(config.output_data_dir / 'openai_raw_results.txt', "r") as f:
    data = [
        claim_collection_json_to_parsed(json.loads(line))
        for line in f
        if line.strip()
    ]

In [None]:
#| notest
len(data)

20562

We also want to keep a dictionary that maps `bird_id` to the unique database name of BIRD. Let's do it:

In [None]:
#| notest

bird_id_mappings = {}

# Stream the unfiltered file directly into the mapping. The global `bird_data`
# (the filtered examples loaded earlier) is deliberately not rebound here, so
# re-running an earlier cell still sees the data it was written against.
with open(config.bird_dir / 'train_dev.jsonl', 'r') as f:
    for line in f:
        item = json.loads(line)
        bird_id_mappings[item['bird_id']] = {
            'db_name': item['db_id'],
            'extra_info': item['evidence']
        }

In [None]:
#| notest
bird_id_mappings[0]

{'db_name': 'california_schools',
 'extra_info': 'Eligible free rate for K-12 = `Free Meal Count (K-12)` / `Enrollment (K-12)`'}

In [None]:
#| notest
len(bird_id_mappings)

10962

In [None]:
#| notest

claims = []
claim_id = 0  # running id, unique across all labels and BIRD examples

# Attribute that holds the claim text on each parsed item, per label.
text_attr_by_label = {
    "ENTAILED": "entailed_claim",
    "CONTRADICTED": "contradicted_claim",
    "NOT ENOUGH INFO": "no_info_claim",
}

for bird_id, label, claim_collection in data:
    text_attr = text_attr_by_label.get(label)

    for claim_bucket in claim_collection.output_parsed.collection:
        source_info = bird_id_mappings[bird_id]
        claim_text = getattr(claim_bucket, text_attr) if text_attr else None

        claim_record = {
            "bird_id": bird_id,
            "claim_id": claim_id,
            "db_name": source_info['db_name'],
            "claim": claim_text,
            "extra_info": source_info['extra_info'],
            "label": label
        }

        # Only NOT ENOUGH INFO items carry an abstention category.
        if label == "NOT ENOUGH INFO":
            claim_record['category'] = claim_bucket.category.upper()

        claims.append(claim_record)
        claim_id += 1

In [None]:
#| notest
len(claims)

64894

In [None]:
#| notest
claims[1:3]

[{'bird_id': 0,
  'claim_id': 1,
  'db_name': 'california_schools',
  'claim': 'At least one Alameda County K-12 school has a Free Meal Count equal to its K-12 Enrollment, producing an eligible free rate of 1.0.',
  'extra_info': 'Eligible free rate for K-12 = `Free Meal Count (K-12)` / `Enrollment (K-12)`',
  'label': 'ENTAILED'},
 {'bird_id': 0,
  'claim_id': 2,
  'db_name': 'california_schools',
  'claim': 'No K-12 school in Alameda County has a 100% eligible free rate; the highest share is 0.96.',
  'extra_info': 'Eligible free rate for K-12 = `Free Meal Count (K-12)` / `Enrollment (K-12)`',
  'label': 'CONTRADICTED'}]

In [None]:
#| notest
claims[63000:63002]

[{'bird_id': 10670,
  'claim_id': 63000,
  'db_name': 'movie_3',
  'claim': 'BUCKET BROTHERHOOD is the film rented the most times by customers.',
  'extra_info': 'film refers to title; film rented the most times refers to title where Max(Count(rental_id))',
  'label': 'ENTAILED'},
 {'bird_id': 10670,
  'claim_id': 63001,
  'db_name': 'movie_3',
  'claim': 'Among all films, BUCKET BROTHERHOOD recorded the highest number of rentals.',
  'extra_info': 'film refers to title; film rented the most times refers to title where Max(Count(rental_id))',
  'label': 'ENTAILED'}]

In [None]:
#| notest
claims[1000]

{'bird_id': 161,
 'claim_id': 1000,
 'db_name': 'financial',
 'claim': 'The client with ID 13539 owns a junior credit card.',
 'extra_info': '',
 'label': 'ENTAILED'}

In [None]:
#| notest
# Overwriting ('w') is fine here: the raw results file is append-only, so this
# file is always rebuilt from the complete set of raw results.
# UTF-8 is requested explicitly because `ensure_ascii=False` writes non-ASCII
# characters verbatim, which a platform-default encoding may not be able to encode.
with open(config.output_data_dir / 'all_claims.jsonl', "w", encoding="utf-8") as f:
    for claim in claims:
        f.write(json.dumps(claim, ensure_ascii=False) + '\n')

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()