In [2]:
import torch
import torch.nn as nn
import utility_functions as uf
import pandas as pd
import numpy as np
import importlib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
pd.options.display.max_columns = None
importlib.reload(uf)

<module 'utility_functions' from 'c:\\Users\\emilw\\Assignment3\\sepsis-ts\\SepsisTS\\utility_functions.py'>

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Loading the data manually
data1 = pd.read_table("raw_data/sepsisexp_timeseries_partition-A.tsv")
data2 = pd.read_table("raw_data/sepsisexp_timeseries_partition-B.tsv")
data3 = pd.read_table("raw_data/sepsisexp_timeseries_partition-C.tsv")
data4 = pd.read_table("raw_data/sepsisexp_timeseries_partition-D.tsv")

In [6]:
# Setting partitions, original authors used these for cross validation. 
# The dataset is split into four partitions (A–D) that are used in the 4-fold cross validation experiments in (Schamoni et al., 2022). 
data1["partition"] = 1
data2["partition"] = 2
data3["partition"] = 3
data4["partition"] = 4

In [7]:
data_orig = pd.concat([data1, data2, data3, data4], ignore_index=True)

In [8]:
df = data_orig.copy()

In [9]:
#Removes rows for each patient after severity>=2 as we want to predict before the patient get sepsis. 
#Also marks the last (input hour) rows as targets.

def setSepsisColumn(df_input, colName, hours):
    hours = hours * 2
    df_input[colName] = 0
    df_result = df_input[df_input["severity"] < 2]

    for pid in df_result["id"].unique():
        last_n_per_id = df_result[df_result["id"]==pid].tail(hours).index
        #print(last_n_per_id)
        df_result.loc[last_n_per_id, colName] = 1

    return df_result



df_test = df.copy()
df_test = setSepsisColumn(df_test, "2hourSepsis", 2)
df_test = setSepsisColumn(df_test, "4hourSepsis", 4)
df_test = setSepsisColumn(df_test, "6hourSepsis", 6)

In [10]:
df_prep = df_test.copy()

In [11]:
cols_to_drop = ["severity",'timestep',]
df_prep.drop(columns=cols_to_drop, inplace=True)

In [12]:
df_2hr = df_prep.drop(["4hourSepsis", "6hourSepsis", ], axis=1)
df_4hr = df_prep.drop(["2hourSepsis", "6hourSepsis", ], axis=1)
df_6hr = df_prep.drop(["2hourSepsis", "4hourSepsis", ], axis=1)
df_6hr

Unnamed: 0,id,sepsis,respiratory_minute_volume,heart_rate,leukocytes,temperature,partial_co2,respiratory_rate,arterial_ph,bilirubin,blood_urea_nitrogen,creatinine,diastolic_bp,fraction_of_inspired_o2,mean_bp,partial_pressure_art._o2,systolic_bp,thrombocytes,horowitz_index,bun/creatinine_ratio,delta-temperature,lactate,bicarbonate,c-reactive_protein,hemoglobin,heart_time_volume,lymphocytes,sodium,pancreatic_lipase,procalcitonin,quick_score,oxygen_saturation,blood_glucose,base_excess,chloride,calcium,potassium,mixed_venous_oxygen_saturation,urine_output,net balance,alanine_transaminase,aspartate_transaminase,stroke_volume,svri,age,partition,6hourSepsis
0,12292,0,0.190898,0.424464,0.301015,-0.168117,-0.275272,1.879692,-0.041447,0.179544,1.445381,2.396762,-0.239052,1.479290,-0.300612,-0.005231,-0.320473,1.114980,-0.938897,-0.740976,-1.239555,-0.268699,-0.654281,0.416335,1.856485,-0.867504,-0.811566,-0.248615,0.781767,-0.349756,-0.184841,-0.782330,-0.914517,-1.152851,-0.517229,1.019083,-0.334653,0.010733,-0.710447,1.259337,-0.023852,0.117472,0.317126,0.061715,0.371047,1,0
1,12292,0,0.157654,0.667394,0.301015,-0.168117,-0.275272,1.708485,-0.041447,0.179544,1.445381,2.396762,-0.936105,0.573504,-1.329784,-0.005231,-1.485067,1.114980,-0.562761,-0.740976,-1.239555,-0.268699,-0.654281,0.416335,1.856485,-0.867504,-0.811566,-0.248615,0.781767,-0.349756,-0.184841,-0.782330,-0.914517,-1.152851,-0.517229,1.019083,-0.334653,0.010733,-0.710447,1.259337,-0.023852,0.117472,0.317126,0.061715,0.371047,1,0
2,12292,0,0.024678,0.618808,0.301015,-0.732387,1.003408,2.050899,1.804109,0.179544,1.445381,2.396762,-1.424042,1.026397,-1.615666,-1.310985,-1.368608,1.114980,-1.498923,-0.740976,-0.377821,-0.171096,4.114980,0.416335,-0.884218,-0.867504,-0.811566,-0.010212,0.781767,-0.349756,-0.184841,-2.050742,0.401916,5.255039,-3.242374,-0.868157,2.923028,0.010733,-0.710447,1.259337,-0.023852,0.117472,0.317126,0.061715,0.371047,1,0
3,12292,0,-0.208030,0.278706,0.301015,-0.732387,1.003408,1.366071,1.804109,0.179544,1.445381,2.396762,-1.284632,6.008223,-1.444137,-1.310985,-1.368608,1.114980,-2.018409,-0.740976,-0.377821,-0.171096,4.114980,0.416335,-0.884218,-0.867504,-0.811566,-0.010212,0.781767,-0.349756,-0.184841,-2.050742,0.401916,5.255039,-3.242374,-0.868157,2.923028,0.010733,-0.710447,1.259337,-0.023852,0.117472,0.317126,0.061715,0.371047,1,0
4,12292,0,-0.108298,-0.352912,0.301015,-0.732387,1.094023,1.537278,1.677315,0.179544,1.445381,2.396762,-1.284632,1.116975,-1.386961,-0.977793,-1.368608,1.114980,-1.337929,-0.740976,-0.377821,-0.268699,4.092377,0.416335,-0.884218,-0.867504,-0.811566,-0.010212,0.781767,-0.349756,-0.184841,-1.733639,0.255646,5.165209,-3.128827,-0.868157,2.719423,0.010733,-0.710447,0.452120,-0.023852,0.117472,0.317126,0.061715,0.371047,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602563,13304,1,1.022000,-0.255740,0.500221,0.057592,0.459717,1.023658,0.423464,-0.451520,-0.063366,-0.629891,-0.308758,-1.056912,0.499856,-1.587144,1.154679,1.242303,-0.826056,0.703425,-0.894861,-0.463905,1.266985,0.468437,0.454265,0.024764,0.223144,-0.248615,-0.186168,-0.391198,0.047367,0.803185,-0.475706,1.033018,-0.971420,0.621769,0.683373,0.753742,0.087333,-0.429313,-0.136872,-0.483745,0.499599,-0.350631,0.950335,4,1
602564,13304,1,1.022000,-0.401498,0.500221,0.057592,-0.275272,1.023658,0.874287,-0.451520,-0.063366,-0.629891,-0.657284,-1.056912,0.328327,0.054803,1.115859,1.242303,1.044368,0.703425,-0.894861,-0.561509,0.634098,0.468437,0.135578,0.024764,0.223144,-0.248615,-0.186168,-0.391198,0.047367,0.803185,-0.573219,0.344320,-0.630777,0.224456,0.072557,0.753742,-0.710447,-0.413381,-0.136872,-0.483745,0.499599,-0.350631,0.950335,4,1
602565,13304,1,1.022000,-0.255740,0.500221,-0.055263,-0.275272,1.023658,0.874287,-0.451520,-0.063366,-0.629891,-0.099642,-1.056912,0.614208,0.054803,1.154679,1.242303,1.044368,0.703425,-1.067208,-0.561509,0.634098,0.468437,0.135578,0.024764,0.223144,-0.248615,-0.186168,-0.391198,0.047367,0.803185,-0.573219,0.344320,-0.630777,0.224456,0.072557,0.753742,-0.710447,-0.413381,-0.136872,-0.483745,0.499599,-0.350631,0.950335,4,1
602566,13304,1,1.022000,0.084362,0.500221,-0.055263,-0.275272,1.023658,0.874287,-0.451520,-0.063366,-0.629891,-0.308758,-1.056912,0.385503,0.054803,1.038219,1.242303,1.044368,0.703425,-1.067208,-0.561509,0.634098,0.468437,0.135578,0.024764,0.223144,-0.248615,-0.186168,-0.391198,0.047367,0.168979,-0.573219,0.344320,-0.630777,0.224456,0.072557,0.753742,0.246889,-0.431570,-0.136872,-0.483745,0.499599,-0.350631,0.950335,4,1


In [13]:
# Get the timeseries length of the patient who gets sepsis with least timeseries data. Used as undersampling.

max_len = min(df_2hr[df_2hr["sepsis"] == 1].groupby('id').size())
max_len

group_size = df_2hr[df_2hr["sepsis"] == 1].groupby('id').size()
max_size = group_size.max()
ids_with_max_rows = group_size[group_size == max_size].index.tolist()
print(f"ID(s) with sepsis with the maximum number of rows: {ids_with_max_rows}, Length: {max_len}")

ID(s) with sepsis with the maximum number of rows: [13702], Length: 97


In [14]:
df_2hr_dropped = df_6hr.drop(columns=["sepsis"])

features = df_2hr_dropped.columns[:-1] #all but last column as features
target_column = df_2hr_dropped.columns[-1] #last column as target
print(target_column)
  #target_column error here <---
# Number of features, excluding the 'id' column and last column (target)
num_features = len(features) - 1  # Excluding 'id' column 
#num_features = df_2hr_dropped.drop(columns=["id"], axis=1).shape[1] #old with target included


# Pre-allocate a 3D array for features and 2D array for target
num_sequences = df_2hr_dropped['id'].nunique()
print(num_sequences)
data_3d = np.zeros((num_sequences, max_len, num_features))
target_2d = np.zeros((num_sequences, max_len))
print(data_3d.shape)
print(target_2d.shape)

print("features:",features)

# Assuming you have defined 'max_len', 'data_3d', 'target_2d', and 'df_2hr_dropped' correctly
for i, (pid, group) in enumerate(df_2hr_dropped.groupby('id')):
    # Drop 'id' column and select only the relevant features for this group
    features_data = group[features[1:]]  # Assuming 'features[1:]' skips 'id'
    # Ensure we only take the last 'max_len' rows if this patient has more data than 'max_len'
    if len(group) > max_len:
        features_data = features_data.iloc[-max_len:]  # Select last 'max_len' rows
        target_data = group[target_column].iloc[-max_len:]
    else:
        target_data = group[target_column]
    
    # Calculate starting index if we have fewer than 'max_len' data points
    start_idx = max_len - len(features_data)
    
    # Fill the data into our pre-allocated arrays
    data_3d[i, start_idx:, :] = features_data.to_numpy()
    target_2d[i, start_idx:] = target_data.to_numpy()

# Verify shapes and some contents
print(f"data_3d shape: {data_3d.shape}, target_2d shape: {target_2d.shape}")

6hourSepsis
1275
(1275, 97, 44)
(1275, 97)
features: Index(['id', 'respiratory_minute_volume', 'heart_rate', 'leukocytes',
       'temperature', 'partial_co2', 'respiratory_rate', 'arterial_ph',
       'bilirubin', 'blood_urea_nitrogen', 'creatinine', 'diastolic_bp',
       'fraction_of_inspired_o2', 'mean_bp', 'partial_pressure_art._o2',
       'systolic_bp', 'thrombocytes', 'horowitz_index', 'bun/creatinine_ratio',
       'delta-temperature', 'lactate', 'bicarbonate', 'c-reactive_protein',
       'hemoglobin', 'heart_time_volume', 'lymphocytes', 'sodium',
       'pancreatic_lipase', 'procalcitonin', 'quick_score',
       'oxygen_saturation', 'blood_glucose', 'base_excess', 'chloride',
       'calcium', 'potassium', 'mixed_venous_oxygen_saturation',
       'urine_output', 'net balance', 'alanine_transaminase',
       'aspartate_transaminase', 'stroke_volume', 'svri', 'age', 'partition'],
      dtype='object')
data_3d shape: (1275, 97, 44), target_2d shape: (1275, 97)


In [18]:
print(features_data)
print(target_data)

       respiratory_minute_volume  heart_rate  leukocytes  temperature  \
54435                  -0.008566   -1.276046    0.833223    -0.958095   
54436                  -0.008566   -1.324632    0.833223    -0.845241   
54437                  -0.008566   -0.158568    0.833223    -0.958095   
54438                  -0.008566   -0.450084    0.833223    -0.958095   
54439                  -0.008566   -2.053422    0.833223    -0.845241   
...                          ...         ...         ...          ...   
54527                   0.224143   -0.595842    0.436297    -0.732387   
54528                   0.190898   -0.595842    0.436297    -0.732387   
54529                   0.423607   -0.498670    0.436297    -0.845241   
54530                   0.490095   -0.887358    0.436297    -0.958095   
54531                   0.157654   -0.935944    0.436297    -0.845241   

       partial_co2  respiratory_rate  arterial_ph  bilirubin  \
54435    -0.627664         -0.859618     0.550258  -0.49884