In [1]:
import pandas as pd
import numpy as np

from src.tablebuilder import TableBuilder

In [2]:
# Reload edited modules automatically before each cell runs, so changes to the
# project code imported from src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

There are 13,068 rows in the raw dataset, but this in itself is not very useful information - we really want to know how many individual neonates are in the data (some appear on multiple rows, corresponding to, for example, multiple blood culture tests). This number can be found using the unique identifier column (`Uid`):

In [3]:
# Read the raw extract; `data_filepath` is reused later when building DataManager objects.
data_filepath = './data/sepsis_updated_data_Feb21-Sep23.csv'
raw_df = pd.read_csv(data_filepath)

# Report both the row count and the number of distinct `Uid` values.
print('n rows:', len(raw_df), sep='\n')
print('n unique ids:', raw_df['Uid'].nunique(dropna=False), sep='\n')

n rows:
13068
n unique ids:
12392


Check start and end dates in the dataset:

In [4]:
# Convert the admission column to datetimes before taking its min/max.
raw_df['Datetimeadmission'] = pd.to_datetime(raw_df['Datetimeadmission'])
admission_times = raw_df['Datetimeadmission']
print('First admission:', admission_times.min(), sep='\n')
print('Last admission:', admission_times.max(), sep='\n')

First admission:
2021-02-02 12:10:50
Last admission:
2023-09-30 15:58:36


Find median birthweight:

In [5]:
# Keep one row per distinct (Uid, Birthweight) pair before taking the median,
# so repeated rows for the same neonate are not counted more than once.
birthweight_per_neonate = raw_df[['Uid', 'Birthweight']].drop_duplicates()
birthweight_per_neonate['Birthweight'].median()

2700.0

For the case fatality rate (reported below as deaths per 1,000 neonates), we assume that all those who died had the date and time of their death recorded in Neotree:

In [6]:
# A neonate is counted as having died if any of their rows has a death timestamp.
has_death_time = raw_df['Datetimedeath'].notna()
n_died = raw_df.loc[has_death_time, 'Uid'].nunique(dropna=False)
print('n died:', n_died)
n_total = raw_df['Uid'].nunique(dropna=False)
# Scaled to deaths per 1000 neonates and rounded to a whole number.
deaths_per_thousand = round(n_died / n_total * 1000)
print('case fatality rate:', deaths_per_thousand)

n died: 1963
case fatality rate: 158


How many neonates had blood tests taken, and how many of those had only contaminated or rejected samples?

In [7]:
# Final blood culture result per row; missing means no result was recorded on that row.
bc_result = raw_df['Neolab_finalbcresult']
print('n cases with test taken:', raw_df.loc[bc_result.notna(), 'Uid'].nunique(dropna=False))

# Ids with at least one rejected/contaminated sample vs. ids with at least one usable result.
rejected_ids = raw_df.loc[bc_result.isin(['Contaminant', 'Rej']), 'Uid'].unique()
non_rejected_ids = raw_df.loc[bc_result.isin(['Neg', 'NegP', 'Pos', 'PosP']), 'Uid'].unique()

# Ids that appear only in the rejected set, i.e. never had a usable result.
only_rejected_ids = np.setdiff1d(rejected_ids, non_rejected_ids)
print('n cases with no non-rejected tests:', len(only_rejected_ids))

n cases with test taken: 3033
n cases with no non-rejected tests: 502


Load the `DataManager` class to start preparing the raw data for analysis, first looking at the breakdown of the blood test results after removing the rejected rows:

In [8]:
from src.datamanager import DataManager

In [9]:
# Predictor columns handed to DataManager's de-duplication and train/test split below.
columns_of_interest = [
    'Apgar1', 'Apgar5', 'Age', 'Gender',
    'Satsair', 'Typebirth', 'Romlength',
    'Gestation', 'Birthweight', 'Temperature', 'Skin',
    'Dangersigns', 'Signsrd', 'Wob', 'Activity', 'Umbilicus', 'Colour',
    'Rr', 'Vomiting', 'Abdomen', 'Fontanelle', 'Hr',
]

# Default-configured DataManager over the same raw file that was inspected above.
data_manager = DataManager(data_filepath)

How many cases are included in the analysis?

In [10]:
# Distinct neonates left in the DataManager's frame.
n_included = data_manager.df['Uid'].nunique(dropna=False)
print(n_included)

11890


In [11]:
# Distinct neonates with any discharge diagnosis recorded, and those flagged as EONS.
has_diagnosis = data_manager.df['Diagdis1'].notna()
is_eons_diagnosis = data_manager.df['eons_diagnosis']
n_with_diagnosis_recorded = data_manager.df.loc[has_diagnosis, 'Uid'].nunique(dropna=False)
n_with_eons_diagnosis = data_manager.df.loc[is_eons_diagnosis, 'Uid'].nunique(dropna=False)

print('n_with_diagnosis_recorded:', n_with_diagnosis_recorded)
print('n_with_eons_diagnosis:', n_with_eons_diagnosis)
pct_eons_diagnosis = n_with_eons_diagnosis / n_with_diagnosis_recorded * 100
print(f'pct of diagnoses that were EONS: {pct_eons_diagnosis:.3f}')

n_with_diagnosis_recorded: 8624
n_with_eons_diagnosis: 99
pct of diagnoses that were EONS: 1.148


In [12]:
# Distinct neonates with any cause of death recorded, and those whose cause is flagged as EONS.
has_cause_of_death = data_manager.df['Causedeath'].notna()
is_eons_death = data_manager.df['eons_cause_of_death']
n_with_death_recorded = data_manager.df.loc[has_cause_of_death, 'Uid'].nunique(dropna=False)
n_with_death_recorded_as_eons = data_manager.df.loc[is_eons_death, 'Uid'].nunique(dropna=False)

print('n_with_death_recorded:', n_with_death_recorded)
print('n_with_death_recorded_as_eons:', n_with_death_recorded_as_eons)
pct_eons_death = n_with_death_recorded_as_eons / n_with_death_recorded * 100
print(f'pct of causes of death that were recorded as EONS: {pct_eons_death:.3f}')

n_with_death_recorded: 1878
n_with_death_recorded_as_eons: 47
pct of causes of death that were recorded as EONS: 2.503


How many neonates with blood test results remain in the data after removing contaminated samples?

In [13]:
# Distinct neonates that still have a blood culture result in the DataManager's frame.
has_bc_result = data_manager.df['Neolab_finalbcresult'].notna()
data_manager.df.loc[has_bc_result, 'Uid'].nunique(dropna=False)

2531

In some cases, the results of the blood test are not material to our modelling strategy, as a neonate testing negative with a positive diagnosis would still be counted as positive, because of the diagnosis. The same is true for cause of death:

In [14]:
# Distinct neonates that have a blood culture result AND are already flagged as EONS
# through either the diagnosis or the cause-of-death flag.
has_bc_result = data_manager.df['Neolab_finalbcresult'].notna()
flagged_eons = data_manager.df['eons_diagnosis'] | data_manager.df['eons_cause_of_death']
data_manager.df.loc[has_bc_result & flagged_eons, 'Uid'].nunique(dropna=False)

57

If we remove duplicate rows in the data, we can see how the composite outcome variable is constructed, broken down by the reason each case was classified as it was:

In [15]:
# De-duplicate using the chosen predictors and the composite outcome label.
# NOTE(review): presumably this updates `data_manager.df` in place and adds the
# `description` column read on the next line - confirm in src/datamanager.py.
data_manager.remove_duplicate_predictors(columns_of_interest, 'bc_positive_or_diagnosis_or_cause_of_death')
# Tally the cases by their `description` category.
data_manager.df['description'].value_counts()

no_tests_taken        9270
neg_result_found      1502
all_tests_excluded     738
pos_result_found       234
diagnosis_or_death     146
Name: description, dtype: int64

In [16]:
# Class balance of the composite outcome variable.
outcome_label = 'bc_positive_or_diagnosis_or_cause_of_death'
data_manager.df[outcome_label].value_counts()

False    11510
True       380
Name: bc_positive_or_diagnosis_or_cause_of_death, dtype: int64

By default, the DataManager class ignores positive test results that were taken after the 72-hour threshold, or with otherwise faulty age data, so these results will already have been removed from the values above - but we can tell it not to do this:

In [17]:
# Second DataManager over the same file, this time with drop_bad_age_test_results=False
# so that test results the default configuration would drop are kept.
ignore_problematic_ages_data_manager = DataManager(data_filepath, drop_bad_age_test_results=False)
# Same de-duplication call as for `data_manager`, so the two `description` breakdowns are comparable.
ignore_problematic_ages_data_manager.remove_duplicate_predictors(columns_of_interest, 'bc_positive_or_diagnosis_or_cause_of_death')

This gives us the number of cases with positive test results - including LONS or those where we can't be sure of the neonate's age at the time of testing:

In [18]:
# Breakdown by `description` when results with problematic age data are kept.
description_any_age = ignore_problematic_ages_data_manager.df['description']
description_any_age.value_counts()

no_tests_taken        9270
neg_result_found      2013
pos_result_found       461
diagnosis_or_death     146
Name: description, dtype: int64

The difference between the number of positive-tested cases under these two scenarios:

In [19]:
# Derive both counts from the data rather than hard-coding numbers copied from
# earlier outputs, so this figure stays correct if the input file or the
# DataManager logic changes.
n_pos_any_age = int((ignore_problematic_ages_data_manager.df['description'] == 'pos_result_found').sum())
n_pos_valid_age = int((data_manager.df['description'] == 'pos_result_found').sum())

# Positive-tested cases dropped by the default age filtering.
n_pos_any_age - n_pos_valid_age

227

The number of positive cases can be reconstructed as: n diagnosed positive + n assigned cause of death + n with positive tests who hadn't been diagnosed or assigned cause of death - n removed from the positive class due to age data:

In [20]:
# Rebuild the positive-class size from values computed earlier in the notebook
# instead of hand-copied literals, so the check cannot silently go stale.
n_pos_any_age = int((ignore_problematic_ages_data_manager.df['description'] == 'pos_result_found').sum())
n_pos_valid_age = int((data_manager.df['description'] == 'pos_result_found').sum())
n_removed_for_age = n_pos_any_age - n_pos_valid_age

# EONS diagnoses + EONS causes of death + positive tests (any age) - positives removed for age data.
n_with_eons_diagnosis + n_with_death_recorded_as_eons + n_pos_any_age - n_removed_for_age

380

This is equal to the number of positive cases in our outcome variable.

We can also check how many of the 2531 neonates with blood test results would have been positive if we weren't also using the diagnosis and cause of death variables (again keeping results with problematic age data):

In [21]:
# Third DataManager over the same file: keeps results with problematic age data
# (drop_bad_age_test_results=False) and switches off the diagnosis / cause-of-death inputs.
ignore_problematic_ages_and_diagnoses_data_manager = DataManager(data_filepath, drop_bad_age_test_results=False, include_diagnosis_and_cause_of_death=False)
# NOTE(review): the same outcome label is passed even though diagnosis and cause of death
# are disabled here - confirm that this is what remove_duplicate_predictors expects.
ignore_problematic_ages_and_diagnoses_data_manager.remove_duplicate_predictors(columns_of_interest, 'bc_positive_or_diagnosis_or_cause_of_death')

# Breakdown by `description` for this configuration.
ignore_problematic_ages_and_diagnoses_data_manager.df['description'].value_counts()

no_tests_taken      9359
neg_result_found    2052
pos_result_found     479
Name: description, dtype: int64

For the analysis of missing values:

In [22]:
# Features covered by the missing-value analysis: the modelling columns plus
# 'Apgar10' and 'Bsmmol'.
all_features = [
    'Apgar1', 'Apgar5', 'Apgar10', 'Age', 'Gender',
    'Bsmmol', 'Satsair', 'Typebirth', 'Romlength',
    'Gestation', 'Birthweight', 'Temperature', 'Skin',
    'Dangersigns', 'Signsrd', 'Wob', 'Activity', 'Umbilicus', 'Colour',
    'Rr', 'Vomiting', 'Abdomen', 'Fontanelle', 'Hr',
]
# Boolean mask, one entry per feature in `all_features` order, marking float64 columns.
is_float = data_manager.df[all_features].dtypes == 'float64'

# Missing-value count and share of rows for each float64 feature, most-missing first.
float_features = [name for name, flag in zip(all_features, is_float) if flag]
n_missing_float = data_manager.df[float_features].isna().sum().sort_values(ascending=False)
n_rows = len(data_manager.df)
for feature, n_missing in n_missing_float.items():
    print(f'{feature:<12} | {n_missing:>5} | {n_missing / n_rows:.3%}')

Bsmmol       | 11639 | 97.889%
Apgar10      | 10123 | 85.139%
Apgar5       |   851 | 7.157%
Apgar1       |   812 | 6.829%
Age          |   269 | 2.262%
Satsair      |   238 | 2.002%
Rr           |   119 | 1.001%
Birthweight  |    35 | 0.294%
Hr           |    15 | 0.126%
Gestation    |     3 | 0.025%
Temperature  |     0 | 0.000%


In [23]:
# Missing-value count and share of rows for each feature that is NOT float64,
# most-missing first.
# Invert the boolean mask with `~`: unary `-` is not a supported way to negate
# booleans in NumPy and only worked here through a pandas special case.
non_float_features = np.array(all_features)[~is_float]
n_missing_non_float = data_manager.df[non_float_features].isna().sum().sort_values(ascending=False)
n_rows = len(data_manager.df)
for index, value in n_missing_non_float.items():
    print(f'{index:<12} | {value:>5} | {value / n_rows:.3%}')

Wob          |  6418 | 53.978%
Romlength    |  1362 | 11.455%
Skin         |    83 | 0.698%
Abdomen      |    53 | 0.446%
Colour       |    31 | 0.261%
Vomiting     |    31 | 0.261%
Umbilicus    |    29 | 0.244%
Typebirth    |     3 | 0.025%
Signsrd      |     3 | 0.025%
Activity     |     2 | 0.017%
Fontanelle   |     2 | 0.017%
Dangersigns  |     1 | 0.008%
Gender       |     0 | 0.000%


To count values in the composite outcome variable after rows with missing continuous values have been removed:

In [24]:
# Build the train/test split with a fixed seed, then recombine the two label
# sets to count the outcome classes across all rows returned.
X_train, X_test, y_train, y_test = data_manager.get_X_y(
    columns_of_interest,
    seed=2024,
    y_label='bc_positive_or_diagnosis_or_cause_of_death',
)
y = pd.concat([y_train, y_test])
print(f'Total number of rows: {len(y)}')
pd.Series(y).value_counts()

Total number of rows: 10420


False    10094
True       326
Name: bc_positive_or_diagnosis_or_cause_of_death, dtype: int64

Finally, we generate the values for the table summarising the input data, using some fairly complex logic hidden in `./src/tablebuilder.py`:

In [25]:
# Take the group sizes shown in the column headers from the data itself rather
# than hard-coding them, so the exported header cannot drift out of sync with the table.
# They are computed before the table is built, from the same frame TableBuilder receives.
outcome = data_manager.df['bc_positive_or_diagnosis_or_cause_of_death']
n_cases = len(outcome)
n_positive = int(outcome.sum())
n_negative = n_cases - n_positive

table_builder = TableBuilder()
table_df = table_builder.run(data_manager)

# Write the table with human-readable column headers; the frame's own column labels are replaced.
table_df.to_csv('./output/table_df_20240125_v1.csv', index=False, header=[
    'Predictor',
    'Levels',
    f'Total / median number of cases (n={n_cases})',
    f'Composite outcome positive (n={n_positive})',
    f'Composite outcome negative (n={n_negative})',
])
table_df

Value(data_name='*', display_name='Yes (all feeds/ blood/ green)')


Unnamed: 0,0,1,2,3,4
0,Age (hour); median [Q1-Q3],-,2.0 [1.0-7.0],2.0 [1.0-8.0],2.0 [1.0-7.0]
1,,Missing n (%),269 (2.26),2 (0.74),267 (99.26)
2,Gender; n (%),Female,5334 (44.86),160 (3.00),5174 (97.00)
3,,Male,6532 (54.94),220 (3.37),6312 (96.63)
4,,Ambiguous,24 (0.20),0 (0.00),24 (100.00)
...,...,...,...,...,...
69,,Missing,53 (0.45),0 (0.00),53 (100.00)
70,Fontanelle; n (%),Flat,11785 (99.12),377 (3.20),11408 (96.80)
71,,Bulging,51 (0.43),1 (1.96),50 (98.04)
72,,Sunken,52 (0.44),2 (3.85),50 (96.15)


Save out the cleaned data for verification purposes:

In [26]:
# Write the DataManager's current frame to disk without the index column.
cleaned_data_path = './data/sepsis_updated_data_Feb21-Sep23_n11890_v2.csv'
data_manager.df.to_csv(cleaned_data_path, index=False)