In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os.path
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from os.path import join
import dvu
dvu.set_style()
from collections import defaultdict
csv_dir = join('../data', 'PECARN_Registry', 'C. CSV Datasets')

def load_df(prefix='DIAGNOSISICD10'):
    csv_files = [f for f in os.listdir(csv_dir) if f.startswith(prefix) and f.endswith('.CSV')]
    dfs = []
    for f in tqdm(csv_files):
        dfs.append(pd.read_csv(join(csv_dir, f), engine='pyarrow'))
    df = pd.concat(dfs, ignore_index=True)
    return df

- Our background
  - The original PECARN analysis was based on 12,044 patients, 203 with IAI-I	([Holmes et al. 2013](https://pubmed.ncbi.nlm.nih.gov/23375510/).)
  - In our stress-testing, we evaluate external validity of the PECARN IAI-I rule using the PSRC cohort of 2,188 patients, 62 with IAI-I ([Kornblith et al. 2022](https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000076))
  - We also performed analysis in evaluating patient perspectives with LLMs for PECARN's PediDOSE EFIC trial ([Kornblith et al. 2025](https://www.nature.com/articles/s41598-025-89996-w))
- PECARN registry metadata
  - PECARN public registry includes 5,831,284 records across 2012-2021
  - There is a fair amount of data for ICD-10 codes relevant to IAI, e.g. code S36 (5,650 records), S37 (2,026 records)
  - Among the patients with either of these codes we find that 12.50% have a GCS-Score of 14 or less, compared to the 3.53% in the rest of the cohort
  - These patients additionally have a mean pain score of 1.99, compared to 1.72 in the rest of the cohort
  - We would like to be able to link these patients to sufficient predictor variables / free text to be able to build and assess different types of models

### Load ICD-10

In [None]:
df_ICD = load_df('DIAGNOSISICD10')

In [None]:
visit_ids_relevant = []
for k in ['S36', 'S37']:  # , 'S35']:
    print(k, df_ICD['DXCode'].str.startswith(k).sum())
    visit_ids_relevant.append(
        df_ICD.loc[df_ICD['DXCode'].str.startswith(k), 'VisitID'].unique())
visit_ids_relevant = set(visit_ids_relevant[0]) & set(visit_ids_relevant[1])

### Load GCSScore

In [None]:
df_GCS = load_df('GCS')

In [None]:
# find intersection between GCS and visits
rel_idxs = df_GCS['VisitID'].isin(visit_ids_relevant)
# plt.hist(df_GCS['GCSTotal'][rel_idxs], bins=20)
sns.histplot(df_GCS['GCSTotal'][rel_idxs], bins=20, stat='probability', )
# plt.hist(df_GCS['GCSTotal'][~rel_idxs], bins=20)
sns.histplot(df_GCS['GCSTotal'][~rel_idxs], bins=20,
             stat='probability', color='red')
plt.title('GCS Distribution')

print((df_GCS[rel_idxs]['GCSTotal'] < 15).mean())
print((df_GCS[~rel_idxs]['GCSTotal'] < 15).mean())

### Load Painscores

In [50]:
df_pain = load_df('PAIN')

100%|██████████| 10/10 [00:00<00:00, 31.68it/s]


In [52]:
rel_idxs = df_pain['VisitID'].isin(visit_ids_relevant)
print(df_pain['PainScore'][rel_idxs].mean())
print(df_pain['PainScore'][~rel_idxs].mean())

1.9921982076963627
1.7201413054198782


### Vitals

In [None]:
df_vitals = load_df('VITALS')

100%|██████████| 10/10 [00:00<00:00, 11.55it/s]


In [54]:
df_vitals

Unnamed: 0,VisitID,PersonID,VitalsID,EDYear,EDMonth,VitalTimeMin,SystolicBP,DiastolicBP,HeartRate,RespiratoryRate,TempC,TempRoute,WeightKg
0,84368,1426836,1,2012,1,8,,,,,,,9.349949
1,84368,1426836,2,2012,1,13,100.0,48.0,112.0,32.0,36.9,Rectal,
2,431369,1664675,1,2012,9,9,,,,,,,53.299330
3,431369,1664675,2,2012,9,10,148.0,76.0,78.0,20.0,37.3,Oral,
4,283793,517135,1,2012,11,24,,,,,,,27.099570
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20475447,239806,2277537,3,2018,10,178,110.0,66.0,105.0,25.0,36.9,Other,
20475448,239806,2277537,2,2018,10,15,,,,,,,18.699900
20475449,578759,578330,1,2018,10,7,122.0,78.0,119.0,16.0,37.1,Oral,60.499250
20475450,578759,578330,3,2018,10,192,108.0,65.0,83.0,12.0,37.2,Other,
