In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from pathlib import Path

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:

def find_project_root(marker=".gitignore"):
    """
    walk up from the current working directory until a directory containing the
    specified marker (e.g., .gitignore) is found.
    """
    current = Path.cwd()
    for parent in [current] + list(current.parents):
        if (parent / marker).exists():
            return parent.resolve()
    raise FileNotFoundError(
        f"Project root marker '{marker}' not found starting from {current}"
    )



In [4]:
root = find_project_root()
DATA_PATH = root / "dataset"
SAVED_DATA_PATH = root / "dataset/dsv4.parquet"

In [5]:
# load the old preprocess_V2 data


In [6]:
import pandas as pd
import numpy as np

sep_col = [
    "BaseExcess",
    "HCO3",
    "FiO2",
    "pH",
    "PaCO2",
    "SaO2",
    "AST",
    "BUN",
    "Alkalinephos",
    "Calcium",
    "Chloride",
    "Creatinine",
    "Glucose",
    "Lactate",
    "Magnesium",
    "Phosphate",
    "Potassium",
    "Bilirubin_total",
    "Hct",
    "Hgb",
    "PTT",
    "WBC",
    "Platelets",
    "Bilirubin_direct",
    "Fibrinogen",
]

# Continues Health Indicators
con_col = ["HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2"]


def feature_missing_information(patient_data, columns):
    # temp_data holds the information from the patient file as well as the features that will be calculated
    temp_data = np.array(patient_data)

    # Calculate 3 features for each column, 2 respective of the frequency of NaN values and 1 respective of the change in recorded values
    for column in columns:
        data = np.array(patient_data[column])
        nan_pos = np.where(~np.isnan(data))[0]

        # Measurement frequency sequence
        interval_f1 = data.copy()
        # Measurement time interval
        interval_f2 = data.copy()

        # If all the values are NaN
        if len(nan_pos) == 0:
            interval_f1[:] = 0
            temp_data = np.column_stack((temp_data, interval_f1))
            interval_f2[:] = -1
            temp_data = np.column_stack((temp_data, interval_f2))
        else:
            # Puts number of measurements into temp_data
            interval_f1[: nan_pos[0]] = 0
            for p in range(len(nan_pos) - 1):
                interval_f1[nan_pos[p] : nan_pos[p + 1]] = p + 1
            interval_f1[nan_pos[-1] :] = len(nan_pos)
            temp_data = np.column_stack((temp_data, interval_f1))

            # Puts the frequency of measurements into temp_data
            interval_f2[: nan_pos[0]] = -1
            for q in range(len(nan_pos) - 1):
                length = nan_pos[q + 1] - nan_pos[q]
                for l in range(length):
                    interval_f2[nan_pos[q] + l] = l

            length = len(patient_data) - nan_pos[-1]
            for l in range(length):
                interval_f2[nan_pos[-1] + l] = l
            temp_data = np.column_stack((temp_data, interval_f2))

        # Differential features
        # These capture the change in values that have been recorded (quite simply as well but it should be just fine)
        diff_f = data.copy()
        diff_f = diff_f.astype(float)
        if len(nan_pos) <= 1:
            diff_f[:] = np.nan
            temp_data = np.column_stack((temp_data, diff_f))
        else:
            diff_f[: nan_pos[1]] = np.nan
            for p in range(1, len(nan_pos) - 1):
                diff_f[nan_pos[p] : nan_pos[p + 1]] = (
                    data[nan_pos[p]] - data[nan_pos[p - 1]]
                )
            diff_f[nan_pos[-1] :] = data[nan_pos[-1]] - data[nan_pos[-2]]
            temp_data = np.column_stack((temp_data, diff_f))

    return temp_data


def feature_slide_window(patient_data, columns):

    window_size = 6
    features = {}

    for column in columns:
        series = patient_data[column]

        features[f"{column}_max"] = series.rolling(
            window=window_size, min_periods=1
        ).max()
        features[f"{column}_min"] = series.rolling(
            window=window_size, min_periods=1
        ).min()
        features[f"{column}_mean"] = series.rolling(
            window=window_size, min_periods=1
        ).mean()
        features[f"{column}_median"] = series.rolling(
            window=window_size, min_periods=1
        ).median()
        features[f"{column}_std"] = series.rolling(
            window=window_size, min_periods=1
        ).std()

        # For calculating std dev of differences, use diff() then apply rolling std
        diff_std = series.diff().rolling(window=window_size, min_periods=1).std()
        features[f"{column}_diff_std"] = diff_std

    # Convert the dictionary of features into a DataFrame
    features_df = pd.DataFrame(features)

    return features_df


def features_score(patient_data):
    """
    Gives score assocciated with the patient data according to the scoring systems of NEWS, SOFA and qSOFA
    """

    scores = np.zeros((len(patient_data), 8))

    for ii in range(len(patient_data)):
        HR = patient_data[ii, 0]
        if HR == np.nan:
            HR_score = np.nan
        elif (HR <= 40) | (HR >= 131):
            HR_score = 3
        elif 111 <= HR <= 130:
            HR_score = 2
        elif (41 <= HR <= 50) | (91 <= HR <= 110):
            HR_score = 1
        else:
            HR_score = 0
        scores[ii, 0] = HR_score

        Temp = patient_data[ii, 2]
        if Temp == np.nan:
            Temp_score = np.nan
        elif Temp <= 35:
            Temp_score = 3
        elif Temp >= 39.1:
            Temp_score = 2
        elif (35.1 <= Temp <= 36.0) | (38.1 <= Temp <= 39.0):
            Temp_score = 1
        else:
            Temp_score = 0
        scores[ii, 1] = Temp_score

        Resp = patient_data[ii, 6]
        if Resp == np.nan:
            Resp_score = np.nan
        elif (Resp < 8) | (Resp > 25):
            Resp_score = 3
        elif 21 <= Resp <= 24:
            Resp_score = 2
        elif 9 <= Resp <= 11:
            Resp_score = 1
        else:
            Resp_score = 0
        scores[ii, 2] = Resp_score

        Creatinine = patient_data[ii, 19]
        if Creatinine == np.nan:
            Creatinine_score = np.nan
        elif Creatinine < 1.2:
            Creatinine_score = 0
        elif Creatinine < 2:
            Creatinine_score = 1
        elif Creatinine < 3.5:
            Creatinine_score = 2
        else:
            Creatinine_score = 3
        scores[ii, 3] = Creatinine_score

        MAP = patient_data[ii, 4]
        if MAP == np.nan:
            MAP_score = np.nan
        elif MAP >= 70:
            MAP_score = 0
        else:
            MAP_score = 1
        scores[ii, 4] = MAP_score

        SBP = patient_data[ii, 3]
        Resp = patient_data[ii, 6]
        if SBP + Resp == np.nan:
            qsofa = np.nan
        elif (SBP <= 100) & (Resp >= 22):
            qsofa = 1
        else:
            qsofa = 0
        scores[ii, 5] = qsofa

        Platelets = patient_data[ii, 30]
        if Platelets == np.nan:
            Platelets_score = np.nan
        elif Platelets <= 50:
            Platelets_score = 3
        elif Platelets <= 100:
            Platelets_score = 2
        elif Platelets <= 150:
            Platelets_score = 1
        else:
            Platelets_score = 0
        scores[ii, 6] = Platelets_score

        Bilirubin = patient_data[ii, 25]
        if Bilirubin == np.nan:
            Bilirubin_score = np.nan
        elif Bilirubin < 1.2:
            Bilirubin_score = 0
        elif Bilirubin < 2:
            Bilirubin_score = 1
        elif Bilirubin < 6:
            Bilirubin_score = 2
        else:
            Bilirubin_score = 3
        scores[ii, 7] = Bilirubin_score

    return scores


def extract_features(patient_data, columns_to_drop=[]):
    # Get the column with Sepsis Label as it is not the same for each row (check documentation)
    labels = np.array(patient_data["SepsisLabel"])
    patient_data = patient_data.drop(columns=columns_to_drop)

    # Gets information from the missing variables
    # This can be useful as it shows the clinical judgment, the test has not been ordered
    #                              (probably a good decision we should take into account)
    temp_data = feature_missing_information(patient_data, sep_col + con_col)
    temp = pd.DataFrame(temp_data)
    # To complete the data use forward-filling strategy
    temp = temp.fillna(method="ffill")
    # These are also the first set of features
    # In this configutation 99 (66 + 33 or 3 per column) features to be precise
    # They are also time indifferent
    features_A = np.array(temp)
    # The team did not use DBP, not sure why, might investigate this
    # columns = ['HR', 'O2Sat', 'SBP', 'MAP', 'Resp', 'DBP']

    # six-hour slide window statistics of selected columns
    columns = ["HR", "O2Sat", "SBP", "MAP", "Resp"]
    features_B = feature_slide_window(patient_data, columns)

    # Score features based according to NEWS, SOFA and qSOFA
    features_C = features_score(features_A)

    features = np.column_stack([features_A, features_B, features_C])

    return features, labels


# Data Pre-processing
def preprecess_data(dataset, patient_id_map=None):
    frames_features = []
    frames_labels = []

    for patient_id in set(dataset.index.get_level_values(0)):
        if patient_id_map is not None:
            print(
                f"Processing data for patient ID: {patient_id}, File: {patient_id_map[patient_id]}",
                end="\r",
            )

        patient_data = dataset.loc[patient_id]

        features, labels = extract_features(patient_data)
        features = pd.DataFrame(features)
        labels = pd.DataFrame(labels)

        frames_features.append(features)
        frames_labels.append(labels)

    data_features = np.array(pd.concat(frames_features))
    data_labels = (np.array(pd.concat(frames_labels)))[:, 0]

    # Randomly shuffle the data
    index = [i for i in range(len(data_labels))]
    np.random.shuffle(index)
    data_features = data_features[index]
    data_labels = data_labels[index]

    return data_features, data_labels



In [7]:

def get_dataset():
    data = []
    patient_ids = np.array([])
    file_listA = os.listdir(DATA_PATH / 'training_setA')
    file_listB = os.listdir(DATA_PATH / 'training_setB')
    patient_counter = 1  # Start counter for patient IDs
    patient_id_map = {}  # Map to store counter to filename mapping
    
    # Process files in training_setA
    for file_name in file_listA:
        file_path = os.path.join(DATA_PATH, 'training_setA', file_name)
        df_temp = pd.read_csv(file_path, sep='|')
        patient_ids = np.append(patient_ids, [patient_counter] * len(df_temp))  # Use counter as patient ID
        # patient_ids.extends([patient_counter] * len(df_temp))  # Use counter as patient ID
        data.append(df_temp)
        patient_id_map[patient_counter] = file_name  # Map counter to filename
        patient_counter += 1  # Increment counter for the next patient
        print("  ", patient_counter, end='\r')
    print("  ", patient_counter)

    # Process files in training_setB
    for file_name in file_listB:
        file_path = os.path.join(DATA_PATH, 'training_setB', file_name)
        df_temp = pd.read_csv(file_path, sep='|')
        patient_ids = np.append(patient_ids, [patient_counter] * len(df_temp))
        # patient_ids.extend([patient_counter] * len(df_temp))  # Use counter as patient ID
        data.append(df_temp)
        patient_id_map[patient_counter] = file_name  # Map counter to filename
        patient_counter += 1  # Increment counter for the next patient
        print("  ", patient_counter, end='\r')
    print("  ", patient_counter)

    combined_df = pd.concat(data, ignore_index=True)
    combined_df['patient_id'] = patient_ids
    combined_df.set_index(['patient_id', combined_df.index], inplace=True)
    
    print("Dataset loaded into a MultiIndex DataFrame.")
    return combined_df, patient_id_map

In [8]:
def impute_linear_interpolation(data, column_name):
    """
    Imputes missing values using linear interpolation.

    Parameters:
    - data (pd.DataFrame): The data frame containing the column values.
    - column_name (str): The name of the column to impute.

    Returns:
    - pd.DataFrame: DataFrame with missing values imputed using linear interpolation.
    """
    imputed_data = data
    imputed_data[column_name] = data[column_name].interpolate(method='linear')
    return imputed_data

In [9]:



dataset, patient_id_map = get_dataset()

print("Dataset initially: ", dataset.shape)

downsampling = False
if downsampling:
    sepsis_groups = dataset.groupby(level="patient_id")["SepsisLabel"].max()
    patients_sepsis = sepsis_groups[sepsis_groups == 1].index
    patients_no_sepsis = sepsis_groups[sepsis_groups == 0].index
    min_size = len(patients_sepsis)
    sampled_no_sepsis = np.random.choice(
        patients_no_sepsis, min_size, replace=False
    )
    dataset = dataset.loc[np.concatenate([patients_sepsis, sampled_no_sepsis])]
    print("Dataset after downsampling: ", dataset.shape)


columns_to_linearly_interpolate = ["HR", "O2Sat", "SBP", "MAP", "DBP", "Resp"]


# Linear Interpolation
print("Linearly interpolating:")
for col in columns_to_linearly_interpolate:
    if col != "SepsisLabel":  # Ensure we do not interpolate 'SepsisLabel'
        dataset = impute_linear_interpolation(dataset, col)
        print(col)
print("Done")

# ___________________________________________________________________________________________________________________________________

# ___________________________________________________________________________________________________________________________________

def add_nan_indicators(df):
    for column in df.columns:
        df[column + "_nan"] = df[column].isna().astype(int)
    return df

def downsample(X, y):
    index_0 = np.where(y == 0)[0]
    index_1 = np.where(y == 1)[0]
    print(index_0, index_1)

    if len(index_0) > len(index_1):
        index_0 = np.random.choice(index_0, size=len(index_1), replace=False)

    balanced_indices = np.concatenate([index_0, index_1])
    np.random.shuffle(balanced_indices)

    x_balanced = X.iloc[balanced_indices]
    y_balanced = y.iloc[balanced_indices]

    return x_balanced, y_balanced

# ___________________________________________________________________________________________________________________________________

feature_engineer = True
if feature_engineer:
    X = add_nan_indicators(dataset)
    XX, y = preprecess_data(X)
    new_feature_names = [f"new_feature_{i}" for i in range(XX.shape[1])]
    XX_df = pd.DataFrame(XX, columns=new_feature_names, index=X.index)

    # Concatenate the new features from XX_df back to the original DataFrame X
    X = pd.concat([X, XX_df], axis=1)
    y = dataset["SepsisLabel"]

else:
    X = dataset.drop("SepsisLabel", axis=1)
    X = add_nan_indicators(X)
    y = dataset["SepsisLabel"]

# just in case
dataset *= 0

print("Seeing if there are still any nan values or +/- infinities")
# Just trying to fix some errors I got only on a GPU
# if X.isin([np.nan, np.inf, -np.inf]).any().any():
#     print("Data contains NaN or infinite values. Handling...")
#     X.replace([np.inf, -np.inf], np.nan, inplace=True)
#     X.fillna(method='ffill', inplace=True)
if X.isin([np.nan, np.inf, -np.inf]).any().any():
    print("Data contains NaN or infinite values. Handling...")
    # Replace infinite values with NaN so they can be filled too
    X.replace([np.inf, -np.inf], np.nan, inplace=True)

    # First apply forward fill
    X.fillna(method="ffill", inplace=True)
    # Then apply backward fill for any remaining NaNs
    X.fillna(method="bfill", inplace=True)

# Ensure no NaNs or infinities in the target variable as well
if y.isin([np.nan, np.inf, -np.inf]).any():
    print("Target contains NaN or infinite values. Handling...")
    y.replace([np.inf, -np.inf], np.nan, inplace=True)
    y.fillna(method="ffill", inplace=True)


   20337
   40337
Dataset loaded into a MultiIndex DataFrame.
Dataset initially:  (1552210, 41)
Linearly interpolating:
HR
O2Sat
SBP
MAP
DBP
Resp
Done
Seeing if there are still any nan values or +/- infinities
Data contains NaN or infinite values. Handling...


In [10]:
df_save = X.copy()
df_save["SepsisLabel"] = y
#reset index   
df_save.reset_index(drop=False, inplace=True)
df_save.to_parquet(SAVED_DATA_PATH)
print(f"Dataset with labels saved to {SAVED_DATA_PATH}")

Dataset with labels saved to /Users/aidend/Developer/ds_uni/dl-sepsis-prediction/dataset/dsv4.parquet


In [11]:
print(df_save.shape)

(1552210, 303)


In [12]:
df_save.head()

Unnamed: 0,patient_id,level_1,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,new_feature_209,new_feature_210,new_feature_211,new_feature_212,new_feature_213,new_feature_214,new_feature_215,new_feature_216,new_feature_217,new_feature_218
0,1.0,0,80.0,100.0,36.5,121.0,58.0,41.0,13.5,34.0,...,1.298503,1.486046,0.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0
1,1.0,1,76.0,100.0,36.25,113.25,61.0,41.5,12.0,34.0,...,3.029026,4.732864,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0
2,1.0,2,80.0,100.0,36.25,132.75,71.5,46.25,12.0,34.0,...,3.430258,5.609516,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0
3,1.0,3,78.0,100.0,36.1,103.5,58.0,43.0,12.0,34.0,...,1.974842,2.875181,0.0,1.0,0.0,3.0,0.0,0.0,0.0,3.0
4,1.0,4,74.0,100.0,36.0,128.75,69.5,44.5,12.5,34.0,...,2.167948,1.264911,1.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0


In [14]:
# print unique data types
print(df_save.dtypes.unique())
# print unique values
# print(df_save.nunique())


[dtype('float64') dtype('int64')]


In [18]:
# save the columns to a json
columns_json = df_save.columns.tolist()
with open('columns.json', 'w') as f:
    json.dump(columns_json, f)


In [19]:
df_save["Gender"]

0          1
1          1
2          1
3          1
4          1
          ..
1552205    0
1552206    0
1552207    0
1552208    0
1552209    0
Name: Gender, Length: 1552210, dtype: int64

In [22]:
df_save["new_feature_102"].unique()

array([ 5.000e+00,  1.000e+00,  0.000e+00, ..., -9.090e+02, -4.120e+02,
       -1.784e+03])