# Descriptive Summary of RCT Gold Standard

This notebook explores the structure and content of the gold-standard
dataset (`annotated_rct_dataset.json`).

We will:
- Inspect basic structure (rows, columns).
- Summarize PMCIDs and number of rows per article.
- Look at the distribution of `outcome_type` (e.g., binary vs continuous).
- Count how many numeric "facts" (non-missing cells) of each type are present
  (events, group sizes, means, SDs), in total and per article.
- List the most frequent interventions, comparators, and outcomes.
- Cross-tabulate `outcome_type` against whether events or means are reported.


In [2]:
import json
from pathlib import Path

import pandas as pd

# Location of the gold-standard annotation file (a relative path, so it is
# resolved against the notebook's working directory).
GOLD_PATH = Path("annotated_rct_dataset.json")

# Read the file as UTF-8 text and parse it as JSON in one step.
gold_data = json.loads(GOLD_PATH.read_text(encoding="utf-8"))

# Turn the parsed JSON into a DataFrame for the rest of the analysis.
gold_df = pd.DataFrame(gold_data)

# Preview the first rows.
gold_df.head()


Unnamed: 0,id,evidence_inference_prompt_id,pmcid,outcome,intervention,comparator,outcome_type,intervention_events,intervention_group_size,comparator_events,...,comparator_standard_deviation,notes,is_data_in_figure_graphics,is_relevant_data_in_table,is_table_in_graphic_format,is_data_complete,tiktoken_with_attributes_xml_token_num,tiktoken_without_attributes_xml_token_num,tiktoken_without_attributes_markdown_token_num,split
0,1,11248,57750,Death or myocardial infarction,eptifibatide,placebo,binary,,,,...,,Group sizes are not given so the events cannot...,False,False,False,False,5811,4777,3265,DEV
1,2,11249,57750,Myocardial infarction,eptifibatide,placebo,binary,,,,...,,Group sizes are not given so the events cannot...,False,False,False,False,5811,4777,3265,DEV
2,3,11266,1216327,The mean body weight gain,White Grape Juice (WGJ),colored and flavored water (WA),continuous,,30.0,,...,,Missing standard deviations,False,True,False,False,6350,5198,3913,DEV
3,4,11267,1216327,The duration of the illness,White Grape Juice (WGJ),colored and flavored water (WA),continuous,,30.0,,...,27.4,,False,True,False,True,6350,5198,3913,DEV
4,5,11268,1216327,The fecal losses,White Grape Juice (WGJ),colored and flavored water (WA),continuous,,30.0,,...,1.63,,False,True,False,True,6350,5198,3913,DEV


In [3]:
# Basic shape of the dataset: row count, column count, and column names.
n_rows, n_cols = gold_df.shape
print("Number of rows (annotation records):", n_rows)
print("Number of columns:", n_cols)
print("\nColumns:\n", gold_df.columns.tolist())

# How many distinct articles (PMCIDs) are covered, plus a small sample of them.
pmcid_column = gold_df["pmcid"]
print("\nNumber of unique PMCIDs:", pmcid_column.nunique())
print("PMCIDs example:", pmcid_column.unique()[:10])


Number of rows (annotation records): 699
Number of columns: 24

Columns:
 ['id', 'evidence_inference_prompt_id', 'pmcid', 'outcome', 'intervention', 'comparator', 'outcome_type', 'intervention_events', 'intervention_group_size', 'comparator_events', 'comparator_group_size', 'intervention_mean', 'intervention_standard_deviation', 'comparator_mean', 'comparator_standard_deviation', 'notes', 'is_data_in_figure_graphics', 'is_relevant_data_in_table', 'is_table_in_graphic_format', 'is_data_complete', 'tiktoken_with_attributes_xml_token_num', 'tiktoken_without_attributes_xml_token_num', 'tiktoken_without_attributes_markdown_token_num', 'split']

Number of unique PMCIDs: 120
PMCIDs example: [  57750 1216327 1574360 2363753 2667135 2681019 2972614 3003523 3169777
 3195393]


In [4]:
# Number of annotation rows per article, ordered by PMCID rather than by frequency.
pmcid_frequencies = gold_df["pmcid"].value_counts()
rows_per_pmcid = pmcid_frequencies.sort_index()

print("Rows per PMCID (first 20):")
rows_per_pmcid.head(20)


Rows per PMCID (first 20):


pmcid
57750       2
115849     29
547916      1
1216327     3
1475568    15
1574360     2
1863515     6
2363753     4
2430617     3
2596788     1
2667135     5
2681019     3
2836833     4
2952311     6
2972614    10
2974815     3
3003523     6
3136370     4
3169777    15
3195393     4
Name: count, dtype: int64

In [5]:
# Distribution of outcome types; missing values are kept as their own bucket
# (dropna=False) so unlabeled rows would be visible.
if "outcome_type" not in gold_df.columns:
    print("Column 'outcome_type' not found in the dataset.")
else:
    outcome_type_counts = gold_df["outcome_type"].value_counts(dropna=False)
    print("Outcome type value counts:\n")
    print(outcome_type_counts)


Outcome type value counts:

outcome_type
continuous    517
binary        182
Name: count, dtype: int64


In [6]:
# Column names treated as numeric "facts", grouped by the kind of quantity.
# Several spellings are listed per quantity; code that consumes FACT_FIELDS
# checks whether a column exists before reading it.
_FACT_FIELD_GROUPS = {
    "events": (
        "intervention_events",
        "comparator_events",
    ),
    "group sizes": (
        "intervention_group_size",
        "intervention_groupsize",
        "group_size_intervention",
        "comparator_group_size",
        "comparator_groupsize",
        "group_size_comparator",
    ),
    "means": (
        "intervention_mean",
        "mean_intervention",
        "comparator_mean",
        "mean_comparator",
    ),
    "standard deviations": (
        "intervention_standard_deviation",
        "sd_intervention",
        "comparator_standard_deviation",
        "sd_comparator",
    ),
}

# Flattened in group order; the order matters because downstream tables keep
# each field's position in this list as their index.
FACT_FIELDS = [
    field_name
    for group_fields in _FACT_FIELD_GROUPS.values()
    for field_name in group_fields
]

# Lower-cased, whitespace-stripped strings that count as "no value".
# "nan" and "<na>" are the string forms of float NaN and pandas.NA, which is
# how pandas represents missing cells.
MISSING_STRINGS = {
    "", "none", "nr", "not reported", "n/a", "na", "not extractable",
    "nan", "<na>",
}

def is_present(v) -> bool:
    """Return True if ``v`` holds an actual value rather than a missing marker.

    A value is considered missing when it is ``None``, a float NaN, or when
    its string form (stripped and lower-cased) is one of ``MISSING_STRINGS``.

    Parameters
    ----------
    v : any
        A single cell value (string, number, ``None``, NaN, ...).

    Returns
    -------
    bool
        ``False`` for missing markers, ``True`` otherwise (including ``0``).
    """
    if v is None:
        return False
    # NaN is the only float that is not equal to itself. The check is limited
    # to floats because `pd.NA != pd.NA` is not a plain bool.
    if isinstance(v, float) and v != v:
        return False
    s = str(v).strip().lower()
    return s not in MISSING_STRINGS


In [7]:
# Count the non-missing cells of every fact field, one column at a time
# (instead of scanning the frame row by row with iterrows()).
# Fields listed in FACT_FIELDS that are not columns of gold_df keep a count of 0.
facts_by_field = {field: 0 for field in FACT_FIELDS}
for field in FACT_FIELDS:
    if field in gold_df.columns:
        facts_by_field[field] = int(gold_df[field].map(is_present).sum())

total_facts = sum(facts_by_field.values())

print(f"Total gold facts (non-empty cells in FACT_FIELDS): {total_facts}\n")

# One row per field, sorted alphabetically by field name; the index keeps each
# field's position in FACT_FIELDS.
facts_df = (
    pd.DataFrame(
        [{"field": field, "count": cnt} for field, cnt in facts_by_field.items()]
    )
    .sort_values("field")
)

facts_df


Total gold facts (non-empty cells in FACT_FIELDS): 2773



Unnamed: 0,field,count
1,comparator_events,158
5,comparator_group_size,643
6,comparator_groupsize,0
10,comparator_mean,310
14,comparator_standard_deviation,275
7,group_size_comparator,0
4,group_size_intervention,0
0,intervention_events,152
2,intervention_group_size,643
3,intervention_groupsize,0


In [8]:
# Count number of facts per row.
# Build a boolean "is this cell present?" table column by column (only for the
# fact fields that actually exist in gold_df), then sum across columns. This
# replaces a second row-by-row iterrows() scan with the same result.
fact_columns = [field for field in FACT_FIELDS if field in gold_df.columns]
fact_presence = pd.DataFrame(
    {field: gold_df[field].map(is_present) for field in fact_columns},
    index=gold_df.index,
)

# Kept as a plain list of ints, one entry per row in gold_df order.
row_fact_counts = fact_presence.sum(axis=1).astype(int).tolist()

# NOTE: this adds a column to gold_df in place; later cells see the extra column.
gold_df["num_facts_row"] = row_fact_counts

# Total number of facts per article.
facts_per_pmcid = gold_df.groupby("pmcid")["num_facts_row"].sum()

print("Facts per PMCID (first 20):")
facts_per_pmcid.head(20)


Facts per PMCID (first 20):


pmcid
57750       0
115849     62
547916      6
1216327    16
1475568    52
1574360    12
1863515    24
2363753    12
2430617    12
2596788     4
2667135    22
2681019     0
2836833     8
2952311    12
2972614    54
2974815    10
3003523    32
3136370    24
3169777    58
3195393    22
Name: num_facts_row, dtype: int64

In [9]:
def top_values(series, top_n=20, title=None):
    """Print a heading and display the most frequent values of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to tabulate with ``value_counts()``.
    top_n : int, default 20
        Number of leading entries of the frequency table to show.
    title : str, optional
        Heading printed above the table; "Top values" is used when it is
        missing or empty.
    """
    heading = title if title else "Top values"
    frequencies = series.value_counts()
    print("\n" + heading + ":")
    # `display` is the rich-output function provided by the IPython kernel.
    display(frequencies.iloc[:top_n])

# Show the 20 most frequent values of each free-text column, in this order.
for column_name, heading in [
    ("intervention", "Top 20 interventions"),
    ("comparator", "Top 20 comparators"),
    ("outcome", "Top 20 outcomes"),
]:
    top_values(gold_df[column_name], top_n=20, title=heading)



Top 20 interventions:


intervention
Phlebotomy training programme                                                                             20
etoricoxib 90 mg once daily                                                                               19
ACVC-HA - artificial cervical vertebra and intervertebral complex (ACVC) with an HA biocoating or ACVC    15
Follow-up and thorough education on self-care                                                             15
Motivational interviewing through self-determination theory sessions                                      15
MTL+caloric intake group                                                                                  12
Intravenous lidocaine                                                                                     11
naproxen 500 mg twice daily                                                                               10
MTL group                                                                                                 10
Laser 


Top 20 comparators:


comparator
Control                                           107
placebo                                            99
Placebo                                            34
control                                            25
baseline                                           17
Standard information about self-care               15
Standard education session                         15
No label group                                     12
Intravenous morphine                               11
Placebo laser therapy (Group II)                   10
Waiting list control                               10
choices group                                      10
website without social presence elements.           9
naproxen 500 mg twice daily                         9
paracetamol (20 mg/kg) and propofol                 9
treatment as usual (TAU)                            8
no continued contact                                7
propofol based on midazolam                         7
anterior cervical


Top 20 outcomes:


outcome
adverse event profile - headache                                7
unexpected adverse events                                       7
Anxiety                                                         6
Adverse effects                                                 4
intent of purchase                                              4
Taste perception                                                4
Correct selection of the healthier product                      4
Scores on the saturated fat quiz                                4
Scores on sugar quiz                                            4
Scores on sodium quiz                                           4
Motivational Interviewing (MI) knowledge                        3
Nausea                                                          3
ACR20 responder criteria                                        3
Serum C-reactive protein                                        3
heart rate                                                      3
me

In [10]:
def has_any(row, fields):
    """Return True if at least one of ``fields`` holds a present value in ``row``.

    Each field is looked up with ``row.get`` and the result is checked with
    ``is_present``; the scan stops at the first present value.
    """
    for field_name in fields:
        if is_present(row.get(field_name)):
            return True
    return False

# Per-row flags: does the row report any event count / any mean?
# Each flag is True when at least one of the listed fields is present.
# NOTE: the flag columns are added to gold_df in place, in this order.
flag_fields = {
    "has_events": ["intervention_events", "comparator_events"],
    "has_means": ["intervention_mean", "mean_intervention",
                  "comparator_mean", "mean_comparator"],
}
for flag_name, source_fields in flag_fields.items():
    gold_df[flag_name] = gold_df.apply(has_any, axis=1, args=(source_fields,))

# Cross-tabulate outcome type against each flag (with row/column totals).
if "outcome_type" not in gold_df.columns:
    print("No 'outcome_type' column to cross-tab with.")
else:
    events_by_type = pd.crosstab(gold_df["outcome_type"], gold_df["has_events"], margins=True)
    means_by_type = pd.crosstab(gold_df["outcome_type"], gold_df["has_means"], margins=True)
    print(events_by_type)
    print()
    print(means_by_type)


has_events    False  True  All
outcome_type                  
binary           30   152  182
continuous      511     6  517
All             541   158  699

has_means     False  True  All
outcome_type                  
binary          182     0  182
continuous      199   318  517
All             381   318  699
