In [1]:
from dataclasses import dataclass, field

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score, precision_recall_curve, PrecisionRecallDisplay, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_score

from src.datamanager import DataManager

sns.set(font_scale=1.5)

There are 13,068 rows in the raw dataset, but this in itself is not very useful information - we really want to know how many individual neonates are in the data (some appear on multiple rows, corresponding to, for example, multiple blood culture tests). This number can be found using the unique identifier column (`Uid`):

In [5]:
# Path to the raw CSV extract; kept as a named variable because a later cell
# passes the same path to DataManager.
data_filepath = './data/sepsis_updated_data_Feb21-Sep23.csv'
raw_df = pd.read_csv(data_filepath)

# Report the row count and the distinct-Uid count separately: the same Uid can
# occur on more than one row. dropna=False keeps the count equal to
# len(Series.unique()), which includes a missing value if one is present.
print('n rows:', raw_df.shape[0], sep='\n')
print('n unique ids:', raw_df['Uid'].nunique(dropna=False), sep='\n')

n rows:
13068
n unique ids:
12392


Check start and end dates in the dataset:

In [6]:
# Convert the admission column to datetimes in place so min/max compare
# chronologically; re-running this cell on already-parsed values is harmless.
raw_df['Datetimeadmission'] = pd.to_datetime(raw_df['Datetimeadmission'])

# Earliest and latest admission timestamps, each printed on the line after its label.
print('First admission:', raw_df['Datetimeadmission'].min(), sep='\n')
print('Last admission:', raw_df['Datetimeadmission'].max(), sep='\n')

First admission:
2021-02-02 12:10:50
Last admission:
2023-09-30 15:58:36


Find median birthweight:

In [9]:
# Median birthweight over distinct (Uid, Birthweight) pairs, so a neonate that
# appears on several rows with the same birthweight is counted once.
# NOTE(review): a Uid recorded with two different birthweights would still
# contribute twice - confirm whether that can occur in this dataset.
(
    raw_df
    .drop_duplicates(subset=['Uid', 'Birthweight'])['Birthweight']
    .median()
)

2700.0

For the case fatality rate (reported below as deaths per 1,000 neonates), we assume that all those who died had the date and time of their death recorded in Neotree:

In [14]:
# Distinct Uids having at least one row with a non-missing Datetimedeath.
n_died = raw_df.loc[raw_df['Datetimedeath'].notna(), 'Uid'].nunique(dropna=False)
# Distinct Uids in the whole raw dataset (the denominator).
n_total = raw_df['Uid'].nunique(dropna=False)

print('n died:', n_died)
# The printed rate is deaths per 1,000 distinct Uids, rounded to an integer.
print('case fatality rate:', round(n_died / n_total * 1000))

n died: 1963
case fatality rate: 158


How many had blood tests taken, and what were the results?

In [22]:
# Distinct Uids with any non-missing final blood-culture result.
print(
    'n cases with test taken:',
    raw_df.loc[raw_df['Neolab_finalbcresult'].notna(), 'Uid'].nunique(dropna=False),
)

# Uids with at least one result coded 'Contaminant' or 'Rej' ...
rejected_ids = raw_df.loc[
    raw_df['Neolab_finalbcresult'].isin(['Contaminant', 'Rej']), 'Uid'
].unique()
# ... and Uids with at least one result coded 'Neg', 'NegP', 'Pos' or 'PosP'.
non_rejected_ids = raw_df.loc[
    raw_df['Neolab_finalbcresult'].isin(['Neg', 'NegP', 'Pos', 'PosP']), 'Uid'
].unique()

# Uids in the first group but not in the second, i.e. every listed result for
# them is a Contaminant/Rej code.
print(
    'n cases with no non-rejected tests:',
    np.setdiff1d(rejected_ids, non_rejected_ids).size,
)

n cases with test taken: 3033
n cases with no non-rejected tests: 502


Instantiate the `DataManager` class to start preparing the raw data for analysis, first looking at the breakdown of the blood test results after removing the rejected rows:

In [23]:
# Construct the project's DataManager from the same CSV path that raw_df was
# read from; later cells use its `.df` attribute.
# DataManager itself is imported alongside the other imports in the first cell,
# so every dependency of the notebook is visible in one place.
data_manager = DataManager(data_filepath)

In [25]:
# Breakdown of bc_positive using one row per Uid:
#   1. keep rows that have a final blood-culture result;
#   2. order by Neolab_status (ascending), then bc_positive (descending, so
#      True sorts ahead of False within the same status);
#   3. keep the first row of each Uid in that order;
#   4. count the bc_positive values of those rows.
# NOTE(review): the meaning of the Neolab_status ordering is not visible here -
# confirm that ascending order selects the intended row per neonate.
(
    data_manager.df
    .loc[data_manager.df['Neolab_finalbcresult'].notna()]
    .sort_values(by=['Neolab_status', 'bc_positive'], ascending=[True, False])
    .groupby('Uid')
    .head(1)['bc_positive']
    .value_counts()
)

False    2052
True      479
Name: bc_positive, dtype: int64

In [26]:
# Total Uids in the breakdown above: the False (2052) and True (479) counts
# copied from the previous cell's value_counts output.
sum((2052, 479))

2531