DATA IMPUATATION with RF, KNN

In [10]:
import numpy as np
import pandas as pd
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
import math
import matplotlib.pyplot as plt

In [11]:
data_path = "../../data/"
raw_data = data_path + "new_data2.csv"
additional_data = data_path + 'Preprocessed_AIS_ISS_scores.csv'
helsinkiScore_file =  data_path + '_neurobot_data_imaging_classification_score_sex.csv'
# target_path = "../../../data_preparation/biomarkers_acute_sample_462/"
# target_file = 'master_accute_sample_444.xlsx'
save_path = '../../data_preparation/biomarkers_acute_sample_462/'
biomarkers = ['GFAP', 'NFL', 'Tau', 'UCH-L1', 'S100B', 'NSE']


file_name = 'master_acute_sample_462_X7_helsinki.xlsx' # master excel file

In [12]:
# Fill zero and empty cell = NaN
# correct S100B range
# Extract biomarker in dfx 
df = pd.read_csv(raw_data, header = 0, index_col='subjectId')

df1 = pd.read_csv(additional_data, header=0, index_col=0)
print(df1.columns)
df1 = df1.fillna(0)
df2 = pd.read_csv(helsinkiScore_file, header=0)
df2h = df2[['subjectId','Imaging.HelsinkiCTScore']].drop_duplicates()
df2h.set_index('subjectId', inplace=True)

helsinki_indices = df2h[df2h['Imaging.HelsinkiCTScore'].notnull()].index
helsinki_ICU_indices = helsinki_indices.intersection(df.index)
#Select only sample with Helsinki score is not null
df = df.loc[helsinki_ICU_indices]
print(df.shape)


df[df1.columns] = df1
df['Sex'] = df['Sex'].map({'F' : 0, 'M' : 1})
fix_cols = ['Age', 'Sex', 'GCS', 'AIS.InjBodyRegion_1', 'AIS.InjBodyRegion_2', 'AIS.InjBodyRegion_3', 
       'AIS.InjBodyRegion_4', 'AIS.InjBodyRegion_5', 'AIS.InjBodyRegion_6', 'AIS.InjBodyRegion_7', 'AIS.InjBodyRegion_8', 'AIS.InjBodyRegion_9',
       'AIS.InjBodyRegion_10', 'AIS.InjBodyRegion_11', 'AIS.InjBodyRegion_12', 'InjuryHx.TotalISS',
       'MarshallCT', 'Fisher', 'MorrisMarshall', 'RotterdamCT']
dfx = {}    # extracted df by biomarker
for bio in biomarkers:
    bio_cols = fix_cols + [i for i in df.columns if i.startswith(bio)]
    dfx[bio] = df[bio_cols]
    dfx[bio] = dfx[bio].apply(pd.to_numeric, errors='coerce')
    dfx[bio].fillna(np.nan, inplace=True)
    if bio == 'S100B':
        S100B_cols = [i for i in df.columns if i.startswith('S100B')]
        dfx[bio][S100B_cols] = dfx[bio][S100B_cols].where(dfx[bio][S100B_cols] < 1000, dfx[bio][S100B_cols] / 1000)
    print(dfx[bio].columns, dfx[bio].shape)


Index(['AIS.InjBodyRegion_1', 'AIS.InjBodyRegion_2', 'AIS.InjBodyRegion_3',
       'AIS.InjBodyRegion_4', 'AIS.InjBodyRegion_5', 'AIS.InjBodyRegion_6',
       'AIS.InjBodyRegion_7', 'AIS.InjBodyRegion_8', 'AIS.InjBodyRegion_9',
       'AIS.InjBodyRegion_10', 'AIS.InjBodyRegion_11', 'AIS.InjBodyRegion_12',
       'InjuryHx.TotalISS'],
      dtype='object')
(373, 49)
Index(['Age', 'Sex', 'GCS', 'AIS.InjBodyRegion_1', 'AIS.InjBodyRegion_2',
       'AIS.InjBodyRegion_3', 'AIS.InjBodyRegion_4', 'AIS.InjBodyRegion_5',
       'AIS.InjBodyRegion_6', 'AIS.InjBodyRegion_7', 'AIS.InjBodyRegion_8',
       'AIS.InjBodyRegion_9', 'AIS.InjBodyRegion_10', 'AIS.InjBodyRegion_11',
       'AIS.InjBodyRegion_12', 'InjuryHx.TotalISS', 'MarshallCT', 'Fisher',
       'MorrisMarshall', 'RotterdamCT', 'GFAP_1', 'GFAP_2', 'GFAP_3', 'GFAP_4',
       'GFAP_5'],
      dtype='object') (373, 25)
Index(['Age', 'Sex', 'GCS', 'AIS.InjBodyRegion_1', 'AIS.InjBodyRegion_2',
       'AIS.InjBodyRegion_3', 'AIS.InjBodyRegion

In [3]:
# Random Forest regressor
def impute_and_normalize(df):
    # measurement_columns = ['d1', 'd2', 'd3', 'd4', 'd5']
    biomarker_data = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')
    
    # Impute missing data
    imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=20, random_state=42)
    biomarker_data_imputed = pd.DataFrame(imputer.fit_transform(biomarker_data), columns=biomarker_data.columns)
    
    # Normalize the data to range 0-1
    scaler = MinMaxScaler()
    biomarker_data_normalized = pd.DataFrame(scaler.fit_transform(biomarker_data_imputed), columns=biomarker_data_imputed.columns)
    
    return biomarker_data_normalized


In [13]:
# KNN imputation d1-d5
def impute_KNN(df,neighbors=3):
    imputer = KNNImputer(n_neighbors=neighbors)
    dfimputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    return dfimputed
def impute_RF(df):
    imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=30, random_state=42)
    dfimputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    return dfimputed
def normalize(df):
    scaler = MinMaxScaler()
    dfnorm = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    return dfnorm

In [14]:

os.makedirs(save_path, exist_ok=True)

# Save all DataFrames to a single Excel file
full_path = os.path.join(save_path, file_name)

df_RF = {}  # To store the imputed and normalized data frames
df_KNN = {}
dfn_RF = {}
dfn_KNN = {}

dfs = {}
for biomarker in biomarkers:

    df_KNN[biomarker] = impute_KNN(dfx[biomarker], neighbors=3)

    




In [15]:
with pd.ExcelWriter(full_path, engine='xlsxwriter') as writer:
    for biomarker in biomarkers:
        # biomarker_columns = ['subjectId_x'] + [col for col in df.columns if col.startswith(biomarker)]
        dfx[biomarker].to_excel(writer, sheet_name=f'{biomarker}', header=True, index=True)
        
        df_KNN[biomarker].to_excel(writer, sheet_name=f'{biomarker} KNN imputed', header=True, index=True)
        # dfn_KNN[biomarker].to_excel(writer, sheet_name=f'{biomarker} KNN imputed normalized', header=True, index=True)
        # df_KNN[biomarker].iloc[:, -5:].replace(0, 0.001, inplace=True)
        dflog = df_KNN[biomarker].iloc[:, :-5]
        dflog1 = df_KNN[biomarker].iloc[:, -5:].replace(0, 0.001)
        dflog2 = np.log(dflog1)
        dflog3 = normalize(dflog2)
        # dflog1.replace(0, 0.001, inplace=True)
        dflog[df_KNN[biomarker].columns[-5:]] = dflog3
        dflog.to_excel(writer, sheet_name=f'{biomarker} KNN imputed normalized', header=True, index=True)

print(f"File saved successfully at {full_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dflog[df_KNN[biomarker].columns[-5:]] = dflog3
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dflog[df_KNN[biomarker].columns[-5:]] = dflog3
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dflog[df_KNN[biomarker].columns[-5:]] = dflog3
A value is trying to be set on a copy of a slice from a DataFram

File saved successfully at ../../data_preparation/biomarkers_acute_sample_462/master_acute_sample_462_X7_helsinki.xlsx
