In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [2]:
# GLOBAL VARIABLES
# Name of the programmer whose directory layout should be used.
# It must be a key of the path dictionaries below.
programmer = 'esther'

# Per-programmer data directory (not referenced by the cells visible in this notebook).
# Relative paths are built with os.path.join so they resolve on any operating system,
# not only on Windows.
file_paths_names = {
    'patricia': os.path.join("..", "original_data"),
    'esther': r"C:\Users\egh22\OneDrive - University of Canterbury\bootcamp\group project\data"
}

# Per-programmer directory containing the outcome files (Outcomes-a.txt, Outcomes-b.txt).
files_outcome_path_names = {
    'patricia': os.path.join("..", "original_data", "outcomes"),
    'esther': r"C:\Users\egh22\OneDrive - University of Canterbury\bootcamp\group project\data\outcomes"
}

# Per-programmer directory from which the cleaned dataframe is read and to which
# the imputed dataframes are written.
files_path_to_save_files = {
    'patricia': os.path.join("..", "clean_data"),
    'esther': r"C:\Users\egh22\OneDrive - University of Canterbury\bootcamp\group project\clean_data"
}

# Choose the clean-data directory based on the programmer variable
file_path_to_save = files_path_to_save_files[programmer]

<b> 1 - Data importation </b>

In [3]:
# Features import: load the cleaned dataframe, which still contains NaN values.
features_csv_path = os.path.join(file_path_to_save, 'clean_dataframe_with_NaN.csv')
initial_df = pd.read_csv(features_csv_path)

In [12]:
# Describe the distribution of missing values: count the NaNs in every column,
# then summarise those per-column counts (including the upper percentiles).
nan_count_per_column = initial_df.isna().sum()
nan_count_per_column.describe(percentiles=[0.75, 0.85, 0.99])

count      99.000000
mean     1016.131313
std      1106.144865
min         0.000000
50%       299.000000
75%      1966.000000
85%      2409.500000
99%      3606.500000
max      3827.000000
dtype: float64

In [4]:
# Outcomes import: read both outcome files from the selected programmer's directory.
files_outcome_path = files_outcome_path_names[programmer]
df_target_a = pd.read_csv(os.path.join(files_outcome_path, "Outcomes-a.txt"))
df_target_b = pd.read_csv(os.path.join(files_outcome_path, "Outcomes-b.txt"))

# Stack the two outcome tables under a fresh 0..n-1 index, then drop the
# outcome columns that are not kept for the later merge.
unused_outcome_columns = ['SAPS-I', 'SOFA', 'Survival']
stacked_outcomes = pd.concat([df_target_a, df_target_b], ignore_index=True)
outcomes_df = stacked_outcomes.drop(columns=unused_outcome_columns)


<b> METHOD 1 - KNNImputer (imputation for completing missing values using k-Nearest Neighbors) </b>

In [5]:
# Number of neighbouring patients averaged to fill each missing value.
KNN_N_NEIGHBORS = 200

# RecordID is an identifier, not a measurement. It must not take part in the
# neighbour search, otherwise its large values dominate the distance between
# patients, and it must not be rewritten by the imputer either.
id_column = 'RecordID'
feature_columns = [col for col in initial_df.columns if col != id_column]

# KNNImputer uses a (NaN-aware) euclidean distance, so features on large scales
# would outweigh the others. Standardise first; StandardScaler disregards NaNs
# when fitting and keeps them in place when transforming.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(initial_df[feature_columns])

# Initialize KNNImputer and impute the missing values on the standardised features
imputer = KNNImputer(n_neighbors=KNN_N_NEIGHBORS)
imputed_scaled = imputer.fit_transform(scaled_features)

# Bring the imputed values back to the original units of each feature
imputed_data = scaler.inverse_transform(imputed_scaled)

# Convert the result back to a DataFrame, re-attach the untouched RecordID and
# restore the original column order
KNN_imputed_df = pd.DataFrame(imputed_data, columns=feature_columns, index=initial_df.index)
KNN_imputed_df[id_column] = initial_df[id_column]
KNN_imputed_df = KNN_imputed_df[list(initial_df.columns)]

In [6]:
# Sanity check: number of NaN values left in each column after KNN imputation
remaining_nan_per_column = KNN_imputed_df.isna().sum()
remaining_nan_per_column

RecordID           0
Age                0
Gender             0
ICUType            0
BMI                0
                  ..
WBC_median         0
WBC_mad            0
WBC_slope          0
Weight_24median    0
Weight_mad         0
Length: 99, dtype: int64

Merge the imputed dataframe with the outcomes for machine learning.

In [30]:
# Merge the imputed dataframe with the outcomes on each patient ID (RecordID).
# Outcome columns that are already present (for example because this cell was
# executed before) are dropped first, so re-running the cell does not merge the
# outcomes a second time and produce duplicated `_x` / `_y` columns.
outcome_columns = [col for col in outcomes_df.columns if col != 'RecordID']
KNN_imputed_df = pd.merge(
    KNN_imputed_df.drop(columns=outcome_columns, errors='ignore'),
    outcomes_df,
    on='RecordID',
)

In [31]:
# Write the KNN-imputed dataframe to the clean-data directory, without the index column
knn_output_path = os.path.join(file_path_to_save, "KNN_imputed_df.csv")
KNN_imputed_df.to_csv(knn_output_path, index=False)

Parameters of KNNImputer (from the scikit-learn documentation):
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
Parameters:
missing_values : int, float, str, np.nan or None, default=np.nan
The placeholder for the missing values. All occurrences of missing_values will be imputed. For pandas’ dataframes with nullable integer dtypes with missing values, missing_values should be set to np.nan, since pd.NA will be converted to np.nan.

n_neighbors : int, default=5
Number of neighboring samples to use for imputation.

weights : {‘uniform’, ‘distance’} or callable, default=’uniform’
Weight function used in prediction. Possible values:

‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

‘distance’ : weight points by the inverse of their distance. In this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

callable : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.

metric : {‘nan_euclidean’} or callable, default=’nan_euclidean’
Distance metric for searching neighbors. Possible values:

‘nan_euclidean’

callable : a user-defined function which conforms to the definition of _pairwise_callable(X, Y, metric, **kwds). The function accepts two arrays, X and Y, and a missing_values keyword in kwds and returns a scalar distance value.

copy : bool, default=True
If True, a copy of X will be created. If False, imputation will be done in-place whenever possible.

add_indicator : bool, default=False
If True, a MissingIndicator transform will stack onto the output of the imputer’s transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won’t appear on the missing indicator even if there are missing values at transform/test time.

keep_empty_features : bool, default=False
If True, features that consist exclusively of missing values when fit is called are returned in results when transform is called. The imputed value is always 0.

<b> METHOD 2 - Imputing NaN values with -1 </b>

In [13]:
# Replace every missing value with the constant -1
minus1_imputed_df = initial_df.fillna(value=-1)

# Total number of NaN values left across the whole dataframe
n_missing_after = minus1_imputed_df.isna().sum().sum()
print("Number of missing values after imputation:", n_missing_after)


Number of missing values after imputation: 0


In [None]:
# Write the -1-imputed dataframe to the clean-data directory, without the index column
minus1_output_path = os.path.join(file_path_to_save, "minus1_imputed_df.csv")
minus1_imputed_df.to_csv(minus1_output_path, index=False)