# Summary Stats


In [1]:
import sqlite3

import httpx
import pandas as pd

from litreview.database import SCHEMA

# Render every float in displayed frames with exactly one decimal place.
pd.options.display.float_format = "{:0.1f}".format

## Retrieve Integer Fields

In [2]:
# ClinicalTrials.gov API v2 endpoint that reports per-field value statistics.
API_BASE = "https://clinicaltrials.gov/api/v2"
API_FIELD_VALUES = f"{API_BASE}/stats/field/values"

# Request only the fields whose type is INTEGER or NUMBER; fail loudly on any
# non-2xx response rather than parsing an error payload.
response = httpx.get(
    API_FIELD_VALUES,
    params={"types": "INTEGER|NUMBER"},
)
response.raise_for_status()
data = response.json()

# Keep just the "piece" value of each returned entry. These are the names that
# are later matched against the column lists in SCHEMA.
# NOTE(review): despite the name, this list also holds NUMBER-typed fields.
integer_fields = [entry["piece"] for entry in data]

In [3]:
# Open the SQLite database that sits one directory above the notebook's working
# directory. sqlite3 creates an empty file if the path does not exist, so a
# wrong working directory surfaces later as "no such table" errors.
connection = sqlite3.connect(database="../clinical_trials.db")
cursor = connection.cursor()

In [4]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the
# EnrollmentCount column of the Study table.
df = pd.read_sql_query(
    sql="SELECT EnrollmentCount FROM Study",
    con=connection,
).describe()
df

Unnamed: 0,EnrollmentCount
count,493084.0
mean,5352.8
std,481901.4
min,0.0
25%,30.0
50%,69.0
75%,198.0
max,188814085.0


In [11]:
# Build one describe() summary per database table that contains at least one
# of the numeric API fields, then join all summaries column-wise.
#
# The summaries are gathered in a list and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration
# and depends on concatenating with an empty seed DataFrame.
summaries = []
for table, fields in SCHEMA.items():
    # Preserve the API's field order, restricted to columns this table has.
    current_fields = [field for field in integer_fields if field in fields]
    if not current_fields:
        continue  # nothing numeric to summarise in this table
    # Identifiers come from SCHEMA (filtered API names), not from user input;
    # SQL placeholders cannot be used for column or table names.
    query = f"SELECT {', '.join(current_fields)} FROM {table}"
    summaries.append(pd.read_sql(query, connection).describe())

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (the same result the original loop produced when no table matched).
df = pd.concat(summaries, axis=1) if summaries else pd.DataFrame()

df

Unnamed: 0,EnrollmentCount,EventGroupDeathsNumAffected,EventGroupDeathsNumAtRisk,EventGroupOtherNumAffected,EventGroupOtherNumAtRisk,EventGroupSeriousNumAffected,EventGroupSeriousNumAtRisk,LargeDocSize,OtherEventStatsNumAffected,OtherEventStatsNumAtRisk,OtherEventStatsNumEvents,SeriousEventStatsNumAffected,SeriousEventStatsNumAtRisk
count,493084.0,88120.0,88120.0,158608.0,157777.0,158608.0,157854.0,53626.0,0.0,0.0,0.0,0.0,0.0
mean,5352.8,7.1,185.0,39.3,164.8,13.4,173.0,1681869.3,,,,,
std,481901.4,94.5,7261.2,185.8,5461.2,118.8,5489.7,3248562.4,,,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12395.0,,,,,
25%,30.0,0.0,9.0,0.0,10.0,0.0,10.0,332641.2,,,,,
50%,69.0,0.0,25.0,6.0,29.0,0.0,30.0,710891.5,,,,,
75%,198.0,1.0,70.0,26.0,82.0,4.0,83.0,1524950.0,,,,,
max,188814085.0,17251.0,2019461.0,11846.0,2019461.0,13289.0,2019461.0,33548883.0,,,,,
unique,,,,,,,,,0.0,0.0,0.0,0.0,0.0
top,,,,,,,,,,,,,
