In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

seed = 9973

# Load the dataset and drop the index/identifier columns, which are not features.
data = pd.read_csv('mimiciii_sepsis.csv')
data = data.drop(columns=['Unnamed: 0', 'SUBJECT_ID', 'HADM_ID'])

# Encode gender as 1 for 'M' and 0 for every other value.
# NOTE(review): missing or unexpected gender values are also mapped to 0 — confirm this is intended.
data['Gender'] = (data['Gender'] == 'M').astype('int64')

# '28 Day Death' is the prediction target; 'In Hospital Death' is removed from the
# features as well (presumably because it is another outcome label — TODO confirm).
X = data.drop(columns=['28 Day Death', 'In Hospital Death'])
y = data['28 Day Death']

# Coerce every feature to numeric first; entries that cannot be parsed become NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# Split BEFORE computing fill values so that no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Fill missing values with the column means of the (coerced) training rows only.
# A column with no observed training value has a NaN mean and therefore stays NaN.
# NOTE(review): after this fill, the IterativeImputer applied in the next cell has
# nothing left to impute — decide which of the two strategies should own imputation.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X = X.fillna(train_means)

# Summary of the raw dataset; 'Missing Values' is counted on `data`, i.e. before filling.
dataset_description = {
    'Dataset Size': len(data),
    '# Features': len(X.columns),
    'Label Distribution': y.value_counts(normalize=True),
    'Missing Values': data.isnull().sum().sum(),
}

print(dataset_description)

{'Dataset Size': 4555, '# Features': 113, 'Label Distribution': 28 Day Death
0    0.6191
1    0.3809
Name: proportion, dtype: float64, 'Missing Values': 169592}


In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

LABEL = '28 Day Death'
N_SELECTED_FEATURES = 10

# This cell rebinds X_train / X_test to the final (label + selected features) frames.
# Running it a second time would feed those frames, label included, back into the
# feature selection, so fail early with a clear message instead.
if LABEL in X_train.columns:
    raise RuntimeError(
        "X_train already contains the label column; re-run the train/test split cell "
        "before re-running this cell."
    )

# Fit the iterative imputer on the training split only, then apply it to both splits.
imputer = IterativeImputer(max_iter=10, random_state=seed)
train_values = imputer.fit_transform(X_train)
# Ask the imputer for its output column names: by default it drops features that had
# no observed value during fit, so the output may have fewer columns than the input.
imputed_columns = imputer.get_feature_names_out()
train_imputed = pd.DataFrame(train_values, columns=imputed_columns)
test_imputed = pd.DataFrame(imputer.transform(X_test), columns=imputed_columns)

# Scale to [0, 1] using training statistics (chi2 requires non-negative inputs),
# then keep the features with the highest chi-squared score on the training split.
scaler = MinMaxScaler().fit(train_imputed)
train_scaled = scaler.transform(train_imputed)
test_scaled = scaler.transform(test_imputed)
kbest = SelectKBest(chi2, k=N_SELECTED_FEATURES).fit(train_scaled, y_train)
selected_columns = imputed_columns[kbest.get_support()]

# Build the final feature frames; both get a fresh 0..n-1 index.
X_train = pd.DataFrame(kbest.transform(train_scaled), columns=selected_columns)
X_test = pd.DataFrame(kbest.transform(test_scaled), columns=selected_columns)

# Re-index the targets so they align row-by-row with the new frames, then put the
# label in the first column.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train.insert(0, LABEL, y_train)
X_test.insert(0, LABEL, y_test)

# Save datasets to CSV files
X_train.to_csv('data_train.csv', index=False)
X_test.to_csv('data_test.csv', index=False)