In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from pathlib import Path

In [36]:

def find_project_root(marker=".gitignore"):
    """
    walk up from the current working directory until a directory containing the
    specified marker (e.g., .gitignore) is found.
    """
    current = Path.cwd()
    for parent in [current] + list(current.parents):
        if (parent / marker).exists():
            return parent.resolve()
    raise FileNotFoundError(
        f"Project root marker '{marker}' not found starting from {current}"
    )



In [37]:
# load the old preprocess_V2 data
root = find_project_root()

df = pd.read_parquet(root / "dataset" / "V2_preprocessed.parquet")

df.head()

# print the first 5 rows
print(df.head())




           HR      O2Sat       Temp         SBP        MAP        DBP  \
0  102.108491  91.419811  36.919203  128.165094  88.199717  67.007325   
1   97.000000  95.000000  36.919203   98.000000  75.330000  67.007325   
2   89.000000  99.000000  36.919203  122.000000  86.000000  67.007325   
3   90.000000  95.000000  36.919203  122.000000  88.665000  67.007325   
4  103.000000  88.500000  36.919203  122.000000  91.330000  67.007325   

        Resp    EtCO2  BaseExcess       HCO3  ...  MAP_mean_6h  MAP_median_6h  \
0  24.712264  29.6875    0.091837  22.811236  ...    88.199717      88.199717   
1  19.000000  29.6875    0.091837  22.811236  ...    81.764858      81.764858   
2  22.000000  29.6875    0.091837  22.811236  ...    83.176572      86.000000   
3  30.000000  29.6875   24.000000  22.811236  ...    84.548679      87.099858   
4  24.500000  29.6875    0.091837  22.811236  ...    85.904943      88.199717   

   MAP_std_6h  MAP_diff_std_6h  Resp_max_6h  Resp_min_6h  Resp_mean_6h  \


In [38]:
import pandas as pd
import numpy as np

sep_col = [
    "BaseExcess",
    "HCO3",
    "FiO2",
    "pH",
    "PaCO2",
    "SaO2",
    "AST",
    "BUN",
    "Alkalinephos",
    "Calcium",
    "Chloride",
    "Creatinine",
    "Glucose",
    "Lactate",
    "Magnesium",
    "Phosphate",
    "Potassium",
    "Bilirubin_total",
    "Hct",
    "Hgb",
    "PTT",
    "WBC",
    "Platelets",
    "Bilirubin_direct",
    "Fibrinogen",
]

# Continues Health Indicators
con_col = ["HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2"]


def feature_missing_information(patient_data, columns):
    # temp_data holds the information from the patient file as well as the features that will be calculated
    temp_data = np.array(patient_data)

    # Calculate 3 features for each column, 2 respective of the frequency of NaN values and 1 respective of the change in recorded values
    for column in columns:
        data = np.array(patient_data[column])
        nan_pos = np.where(~np.isnan(data))[0]

        # Measurement frequency sequence
        interval_f1 = data.copy()
        # Measurement time interval
        interval_f2 = data.copy()

        # If all the values are NaN
        if len(nan_pos) == 0:
            interval_f1[:] = 0
            temp_data = np.column_stack((temp_data, interval_f1))
            interval_f2[:] = -1
            temp_data = np.column_stack((temp_data, interval_f2))
        else:
            # Puts number of measurements into temp_data
            interval_f1[: nan_pos[0]] = 0
            for p in range(len(nan_pos) - 1):
                interval_f1[nan_pos[p] : nan_pos[p + 1]] = p + 1
            interval_f1[nan_pos[-1] :] = len(nan_pos)
            temp_data = np.column_stack((temp_data, interval_f1))

            # Puts the frequency of measurements into temp_data
            interval_f2[: nan_pos[0]] = -1
            for q in range(len(nan_pos) - 1):
                length = nan_pos[q + 1] - nan_pos[q]
                for l in range(length):
                    interval_f2[nan_pos[q] + l] = l

            length = len(patient_data) - nan_pos[-1]
            for l in range(length):
                interval_f2[nan_pos[-1] + l] = l
            temp_data = np.column_stack((temp_data, interval_f2))

        # Differential features
        # These capture the change in values that have been recorded (quite simply as well but it should be just fine)
        diff_f = data.copy()
        diff_f = diff_f.astype(float)
        if len(nan_pos) <= 1:
            diff_f[:] = np.nan
            temp_data = np.column_stack((temp_data, diff_f))
        else:
            diff_f[: nan_pos[1]] = np.nan
            for p in range(1, len(nan_pos) - 1):
                diff_f[nan_pos[p] : nan_pos[p + 1]] = (
                    data[nan_pos[p]] - data[nan_pos[p - 1]]
                )
            diff_f[nan_pos[-1] :] = data[nan_pos[-1]] - data[nan_pos[-2]]
            temp_data = np.column_stack((temp_data, diff_f))

    return temp_data


def feature_slide_window(patient_data, columns):

    window_size = 6
    features = {}

    for column in columns:
        series = patient_data[column]

        features[f"{column}_max"] = series.rolling(
            window=window_size, min_periods=1
        ).max()
        features[f"{column}_min"] = series.rolling(
            window=window_size, min_periods=1
        ).min()
        features[f"{column}_mean"] = series.rolling(
            window=window_size, min_periods=1
        ).mean()
        features[f"{column}_median"] = series.rolling(
            window=window_size, min_periods=1
        ).median()
        features[f"{column}_std"] = series.rolling(
            window=window_size, min_periods=1
        ).std()

        # For calculating std dev of differences, use diff() then apply rolling std
        diff_std = series.diff().rolling(window=window_size, min_periods=1).std()
        features[f"{column}_diff_std"] = diff_std

    # Convert the dictionary of features into a DataFrame
    features_df = pd.DataFrame(features)

    return features_df


def features_score(patient_data):
    """
    Gives score assocciated with the patient data according to the scoring systems of NEWS, SOFA and qSOFA
    """

    scores = np.zeros((len(patient_data), 8))

    for ii in range(len(patient_data)):
        HR = patient_data[ii, 0]
        if HR == np.nan:
            HR_score = np.nan
        elif (HR <= 40) | (HR >= 131):
            HR_score = 3
        elif 111 <= HR <= 130:
            HR_score = 2
        elif (41 <= HR <= 50) | (91 <= HR <= 110):
            HR_score = 1
        else:
            HR_score = 0
        scores[ii, 0] = HR_score

        Temp = patient_data[ii, 2]
        if Temp == np.nan:
            Temp_score = np.nan
        elif Temp <= 35:
            Temp_score = 3
        elif Temp >= 39.1:
            Temp_score = 2
        elif (35.1 <= Temp <= 36.0) | (38.1 <= Temp <= 39.0):
            Temp_score = 1
        else:
            Temp_score = 0
        scores[ii, 1] = Temp_score

        Resp = patient_data[ii, 6]
        if Resp == np.nan:
            Resp_score = np.nan
        elif (Resp < 8) | (Resp > 25):
            Resp_score = 3
        elif 21 <= Resp <= 24:
            Resp_score = 2
        elif 9 <= Resp <= 11:
            Resp_score = 1
        else:
            Resp_score = 0
        scores[ii, 2] = Resp_score

        Creatinine = patient_data[ii, 19]
        if Creatinine == np.nan:
            Creatinine_score = np.nan
        elif Creatinine < 1.2:
            Creatinine_score = 0
        elif Creatinine < 2:
            Creatinine_score = 1
        elif Creatinine < 3.5:
            Creatinine_score = 2
        else:
            Creatinine_score = 3
        scores[ii, 3] = Creatinine_score

        MAP = patient_data[ii, 4]
        if MAP == np.nan:
            MAP_score = np.nan
        elif MAP >= 70:
            MAP_score = 0
        else:
            MAP_score = 1
        scores[ii, 4] = MAP_score

        SBP = patient_data[ii, 3]
        Resp = patient_data[ii, 6]
        if SBP + Resp == np.nan:
            qsofa = np.nan
        elif (SBP <= 100) & (Resp >= 22):
            qsofa = 1
        else:
            qsofa = 0
        scores[ii, 5] = qsofa

        Platelets = patient_data[ii, 30]
        if Platelets == np.nan:
            Platelets_score = np.nan
        elif Platelets <= 50:
            Platelets_score = 3
        elif Platelets <= 100:
            Platelets_score = 2
        elif Platelets <= 150:
            Platelets_score = 1
        else:
            Platelets_score = 0
        scores[ii, 6] = Platelets_score

        Bilirubin = patient_data[ii, 25]
        if Bilirubin == np.nan:
            Bilirubin_score = np.nan
        elif Bilirubin < 1.2:
            Bilirubin_score = 0
        elif Bilirubin < 2:
            Bilirubin_score = 1
        elif Bilirubin < 6:
            Bilirubin_score = 2
        else:
            Bilirubin_score = 3
        scores[ii, 7] = Bilirubin_score

    return scores


def extract_features(patient_data, columns_to_drop=[]):
    # Get the column with Sepsis Label as it is not the same for each row (check documentation)
    labels = np.array(patient_data["SepsisLabel"])
    patient_data = patient_data.drop(columns=columns_to_drop)

    # Gets information from the missing variables
    # This can be useful as it shows the clinical judgment, the test has not been ordered
    #                              (probably a good decision we should take into account)
    temp_data = feature_missing_information(patient_data, sep_col + con_col)
    temp = pd.DataFrame(temp_data)
    # To complete the data use forward-filling strategy
    temp = temp.ffill()
    # These are also the first set of features
    # In this configutation 99 (66 + 33 or 3 per column) features to be precise
    # They are also time indifferent
    features_A = np.array(temp)
    # The team did not use DBP, not sure why, might investigate this
    # columns = ['HR', 'O2Sat', 'SBP', 'MAP', 'Resp', 'DBP']

    # six-hour slide window statistics of selected columns
    columns = ["HR", "O2Sat", "SBP", "MAP", "Resp"]
    features_B = feature_slide_window(patient_data, columns)

    # Score features based according to NEWS, SOFA and qSOFA
    features_C = features_score(features_A)

    features = np.column_stack([features_A, features_B, features_C])

    return features, labels


# Data Pre-processing
def preprecess_data(dataset, patient_id_map=None):
    frames_features = []
    frames_labels = []
    total_patients = len(set(dataset.index.get_level_values(0)))
    i = 0
    for patient_id in set(dataset.index.get_level_values(0)):
        i += 1
        print(f"Processing patient {i} of {total_patients}",  end="\r",)
        if patient_id_map is not None:
            print(
                f"Processing data for patient ID: {patient_id}, File: {patient_id_map[patient_id]}",
                end="\r",
            )

        patient_data = dataset.loc[patient_id]

        features, labels = extract_features(patient_data)
        features = pd.DataFrame(features)
        labels = pd.DataFrame(labels)

        frames_features.append(features)
        frames_labels.append(labels)

    data_features = np.array(pd.concat(frames_features))
    data_labels = (np.array(pd.concat(frames_labels)))[:, 0]

    # Randomly shuffle the data
    index = [i for i in range(len(data_labels))]
    np.random.shuffle(index)
    data_features = data_features[index]
    data_labels = data_labels[index]

    return data_features, data_labels


def preprecess_data2(dataset, patient_id_map=None):
    frames_features = []
    frames_labels = []

    for patient_id in set(dataset.index.get_level_values(0)):
        if patient_id_map is not None:
            print(
                f"Processing data for patient ID: {patient_id}, File: {patient_id_map[patient_id]}",
                end="\r",
            )

        patient_data = dataset.loc[patient_id]

        features, labels = extract_features(patient_data)
        features = pd.DataFrame(features, index=[patient_id] * len(features))
        labels = pd.DataFrame(labels, index=[patient_id] * len(labels))

        frames_features.append(features)
        frames_labels.append(labels)

    data_features = pd.concat(frames_features)
    data_labels = pd.concat(frames_labels)

    return data_features, data_labels.squeeze()


In [39]:

def add_nan_indicators(df):
    df = df.copy()
    for column in df.columns:
        df[column + "_nan"] = df[column].isna().astype(int)
    df = df.copy()
    return df

In [40]:
# set patient id as index
df.set_index("patient_id", inplace=True)


In [41]:
dataset = df.copy()
X = add_nan_indicators(dataset)
print("preprocess data")
XX, y = preprecess_data(X)
new_feature_names = [f"new_feature_{i}" for i in range(XX.shape[1])]
XX_df = pd.DataFrame(XX, columns=new_feature_names, index=X.index)

# Concatenate the new features from XX_df back to the original DataFrame X
X = pd.concat([X, XX_df], axis=1)
y = dataset["SepsisLabel"]

  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)
  df[column + "_nan"] = df[column].isna().astype(int)


preprocess data
Processing patient 40336 of 40336

In [42]:
if X.isin([np.nan, np.inf, -np.inf]).any().any():
    print("Data contains NaN or infinite values. Handling...")
    # Replace infinite values with NaN so they can be filled too
    X.replace([np.inf, -np.inf], np.nan, inplace=True)

    # First apply forward fill
    X.fillna(method="ffill", inplace=True)
    # Then apply backward fill for any remaining NaNs
    X.fillna(method="bfill", inplace=True)

# Ensure no NaNs or infinities in the target variable as well
if y.isin([np.nan, np.inf, -np.inf]).any():
    print("Target contains NaN or infinite values. Handling...")
    y.replace([np.inf, -np.inf], np.nan, inplace=True)
    y.fillna(method="ffill", inplace=True)

Data contains NaN or infinite values. Handling...


  X.fillna(method="ffill", inplace=True)
  X.fillna(method="bfill", inplace=True)


In [43]:
# prnint the number of columns in X
print(X.shape)

# concat X and y
X["SepsisLabel"] = y

# patient id is the index make it into a normal column
X = X.reset_index(drop=False)

X.head()

(1552210, 569)


Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,new_feature_343,new_feature_344,new_feature_345,new_feature_346,new_feature_347,new_feature_348,new_feature_349,new_feature_350,new_feature_351,new_feature_352
0,1,102.108491,91.419811,36.919203,128.165094,88.199717,67.007325,24.712264,29.6875,0.091837,...,0.816497,1.169045,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0
1,1,97.0,95.0,36.919203,98.0,75.33,67.007325,19.0,29.6875,0.091837,...,1.661554,2.255992,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0
2,1,89.0,99.0,36.919203,122.0,86.0,67.007325,22.0,29.6875,0.091837,...,2.498333,3.89551,1.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0
3,1,90.0,95.0,36.919203,122.0,88.665,67.007325,30.0,29.6875,24.0,...,0.987421,1.114301,1.0,0.0,2.0,1.0,0.0,0.0,3.0,2.0
4,1,103.0,88.5,36.919203,122.0,91.33,67.007325,24.5,29.6875,0.091837,...,3.141125,3.600926,0.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0


In [44]:
# save the data
X.to_parquet(root / "dataset" / "V3_preprocessed.parquet")