In [1]:
#| hide

%load_ext autoreload
%autoreload 2

# Benchmark Creation

> In this notebook, we will create the splits for our benchmark.

- skip_showdoc: true
- skip_exec: true

In [1]:
#| default_exp benchmark_creation

In [None]:
import json
from claimdb.configuration import *
import pandas as pd

### Dev Set

### Helpers

In [None]:
# Read the public test split (one JSON object per line) and keep only each
# record's claim_id, in file order.
with open(config.final_benchmark_dir / 'test-public.jsonl', 'r') as f:
    test_claim_ids = [record['claim_id'] for record in map(json.loads, f)]

In [21]:
# Sanity check: how many claim ids were read from the public test split.
len(test_claim_ids)

1000

In [22]:
# `test_claim_ids` is a list, so `in` on it rescans the whole list for every
# claim. Build a set once so each membership test is a constant-time lookup.
held_out_ids = set(test_claim_ids)

# Keep every judged claim that is NOT part of the public test split.
with open(config.output_data_dir / 'all_claims_judged.jsonl', "r") as f:
    all_claims = [
        claim for claim in map(json.loads, f)
        if claim['claim_id'] not in held_out_ids
    ]

# claim_id -> full claim record, for the claims that survived the filter.
claim_map = {item['claim_id']: item for item in all_claims}

In [23]:
def print_accuracies(df, df_entailed, df_contradicted, df_nei, df_schema, df_subjective, df_counterfactual):
    """Print overall accuracy followed by a per-label and per-NEI-category breakdown.

    Every argument is a frame with a 'correct' column; the accuracy reported for
    a frame is the mean of that column, shown next to the frame's row count.
    `df` is the full result set, the next three are the label slices, and the
    last three are reported as an indented breakdown under NOT ENOUGH INFO.
    Nothing is returned; the report goes to stdout.
    """
    def _accuracy(frame):
        # Mean of the 'correct' column, i.e. the fraction of correct rows.
        return frame['correct'].mean()

    print(f"Total Accuracy: {_accuracy(df):.2%}   (total claims: {len(df)})\n")

    per_label = (
        ("ENTAILED", df_entailed),
        ("CONTRADICTED", df_contradicted),
        ("NOT ENOUGH INFO", df_nei),
    )
    for label, frame in per_label:
        print(f"[{label}] Accuracy: {_accuracy(frame):.2%} ({len(frame)})")

    # Sub-categories are indented to read as a breakdown of the line above.
    per_nei_category = (
        ("OUT-OF-SCHEMA", df_schema),
        ("SUBJECTIVE", df_subjective),
        ("COUNTERFACTUAL", df_counterfactual),
    )
    for label, frame in per_nei_category:
        print(f"    [{label}] Accuracy: {_accuracy(frame):.2%} ({len(frame)})")

### Load Embeddings

In [24]:
# Each similarity file is JSON Lines; decode it into a list of records.
with open(config.embeddings_dir / 'out_of_schema_claim_similarity.jsonl', 'r') as f:
    schema_sims = list(map(json.loads, f))

with open(config.embeddings_dir / 'counterfactual_claim_similarity.jsonl', 'r') as f:
    counterfactual_sims = list(map(json.loads, f))

# Tabular views of the records; later cells filter them by 'claim_id' and
# threshold them on 'similarity'.
df_schema_sims, df_counterfactual_sims = (
    pd.DataFrame(schema_sims),
    pd.DataFrame(counterfactual_sims),
)

## All Claims

In [None]:
# Export every judged claim, unfiltered, as the complete ClaimDB claim file.
#
# The unfiltered list gets its own name on purpose: binding it to `all_claims`
# would replace the test-filtered list built earlier in the notebook, which
# later cells use together with `claim_map` (which excludes the test claims).
with open(config.output_data_dir / 'all_claims_judged.jsonl', "r") as f:
    all_claims_unfiltered = [json.loads(line) for line in f]

# ensure_ascii=False writes non-ASCII characters verbatim, so pin UTF-8 rather
# than relying on the platform's default text encoding.
with open(config.final_benchmark_dir / 'claimdb-all-claims.jsonl', 'w', encoding='utf-8') as f:
    for claim in all_claims_unfiltered:
        f.write(json.dumps(claim, ensure_ascii=False) + '\n')

## Original BIRD

In [7]:
# Copy the filtered BIRD train/dev examples into the benchmark directory as
# the source file the claims refer back to.
with open(config.bird_dir / 'train_dev_filtered.jsonl', "r") as f:
    all_bird = [json.loads(line) for line in f]

# ensure_ascii=False writes non-ASCII characters verbatim, so pin UTF-8 rather
# than relying on the platform's default text encoding.
with open(config.final_benchmark_dir / 'BIRD-source.jsonl', 'w', encoding='utf-8') as f:
    for bird_example in all_bird:  # these are BIRD examples, not claims
        f.write(json.dumps(bird_example, ensure_ascii=False) + '\n')

## Dev Set

### Load Dev Claims

In [25]:
# Index the filtered BIRD examples by bird_id so each claim can be traced back
# to the split ('dev' or otherwise) of the example it refers to.
with open(config.bird_dir / "train_dev_filtered.jsonl", "r") as f:
    bird_id_to_ex_dict = {
        example['bird_id']: example for example in map(json.loads, f)
    }


def _claim_ids_where(pool, field, value):
    """Return the ids in `pool` whose `claim_map` record has `field` equal to `value`."""
    return {claim_id for claim_id in pool if claim_map[claim_id][field] == value}


# Derive the dev ids from claim_map itself. claim_map only holds claims outside
# the public test split, so every claim_map[...] lookup below is guaranteed to
# succeed no matter what `all_claims` happens to be bound to when this cell runs.
dev_claim_ids = {
    claim_id for claim_id, claim in claim_map.items()
    if bird_id_to_ex_dict[claim['bird_id']]['split'] == 'dev'
}

# Partition the dev claims by label.
nei_claim_ids = _claim_ids_where(dev_claim_ids, 'label', 'NOT ENOUGH INFO')
contr_claim_ids = _claim_ids_where(dev_claim_ids, 'label', 'CONTRADICTED')
ent_claim_ids = _claim_ids_where(dev_claim_ids, 'label', 'ENTAILED')

# Split the NOT ENOUGH INFO claims further by category.
schema_claim_ids = _claim_ids_where(nei_claim_ids, 'category', 'OUT-OF-SCHEMA')
subjective_claim_ids = _claim_ids_where(nei_claim_ids, 'category', 'SUBJECTIVE')
counterfactual_claim_ids = _claim_ids_where(nei_claim_ids, 'category', 'COUNTERFACTUAL')

In [26]:
# Number of claims whose source BIRD example is in the 'dev' split.
len(dev_claim_ids)

5884

In [27]:
# Dev claim counts per label: (CONTRADICTED, ENTAILED, NOT ENOUGH INFO).
len(contr_claim_ids), len(ent_claim_ids), len(nei_claim_ids)

(1837, 1344, 2703)

In [28]:
# NOT ENOUGH INFO counts per category: (OUT-OF-SCHEMA, SUBJECTIVE, COUNTERFACTUAL).
len(schema_claim_ids), len(subjective_claim_ids), len(counterfactual_claim_ids)

(1480, 596, 627)

In [29]:
# Keep only the similarity rows whose claim_id is in the id set collected for
# that category.
schema_mask = df_schema_sims['claim_id'].isin(schema_claim_ids)
counterfactual_mask = df_counterfactual_sims['claim_id'].isin(counterfactual_claim_ids)

schema_to_sample = df_schema_sims.loc[schema_mask]
counterfactual_to_sample = df_counterfactual_sims.loc[counterfactual_mask]

In [30]:
# Rows left in each similarity table after restricting to the category's claim ids.
len(schema_to_sample), len(counterfactual_to_sample)

(1480, 627)

### Keep out-of-schema / counterfactual claims at or above the third quartile (Q3) of similarity

In [31]:
# 0.75 quantile (third quartile) of the 'similarity' column, one cut-off per table.
q3_schema, q3_counterfactual = (
    frame['similarity'].quantile(0.75)
    for frame in (schema_to_sample, counterfactual_to_sample)
)

In [32]:
# Inspect the 0.75-quantile similarity cut-off computed for the out-of-schema table.
q3_schema

np.float64(0.8597057462363621)

In [33]:
# Keep the rows whose similarity reaches the table's Q3 cut-off (inclusive).
schema_quartile = schema_to_sample.loc[
    schema_to_sample['similarity'].ge(q3_schema)
]
counterfactual_quartile = counterfactual_to_sample.loc[
    counterfactual_to_sample['similarity'].ge(q3_counterfactual)
]

In [34]:
# Narrow both id sets to the claims that passed the Q3 similarity filter.
# NOTE: this rebinds names that the earlier filtering cell reads, so re-running
# that cell after this one starts from the already-narrowed sets.
schema_claim_ids = {*schema_quartile['claim_id']}
counterfactual_claim_ids = {*counterfactual_quartile['claim_id']}

### Creation

In [35]:
import random

# Seed the module-level RNG so the draw below is repeatable.
random.seed(8736)

# A set has no guaranteed iteration order (for string ids it can differ between
# interpreter runs because of hash randomization), so list(a_set) hands the RNG
# a differently ordered population each time and the seed alone does not fix
# which claims are drawn. Sorting gives every population a deterministic order.
# NOTE: this changes which claims are selected compared with earlier unsorted runs.
contr = random.sample(sorted(contr_claim_ids), 331)
ent = random.sample(sorted(ent_claim_ids), 333)
nei_schema = random.sample(sorted(schema_claim_ids), 111)
nei_subjective = random.sample(sorted(subjective_claim_ids), 112)
nei_counterfactual = random.sample(sorted(counterfactual_claim_ids), 113)

# 331 + 333 + 111 + 112 + 113 = 1000 claim ids in total.
custom_claim_ids = contr + ent + nei_schema + nei_subjective + nei_counterfactual

In [36]:
# Sanity check: total number of sampled claim ids across all five groups.
len(custom_claim_ids)

1000

In [2]:
#| hide
# Run nbdev's export step for this project (kept in a hidden cell).
import nbdev; nbdev.nbdev_export()