### RANDOM FOREST IMPUTATION

#### Import Libraries and Start Timing

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  # Enabling iterative imputer
from sklearn.impute import IterativeImputer
import time

# Start timing the imputation process.
# time.time() returns wall-clock seconds since the epoch; the value is kept in
# `start_time` so the final cell can report the elapsed run time as the
# difference between its own time.time() reading and this one.
start_time = time.time()

#### Load and Preprocess Data

In [None]:
# Load the dataset
merged_df_NEW = pd.read_csv('merged_df_NEW.csv')

# Convert 'admittime' to numeric seconds since the Unix epoch so that it can be
# used as a feature (and be imputed) alongside the other numeric columns.
#
# The conversion is done by subtracting the epoch and dividing by one second
# instead of `astype(int) / 10**9`, because:
#   * a missing admittime (NaT) must become NaN so the imputer sees it as
#     missing; an integer cast cannot represent NaT and yields a huge negative
#     sentinel value (or an error, depending on the pandas version);
#   * `int` is platform dependent, and the fixed 10**9 divisor silently
#     assumes nanosecond resolution of the datetime column.
admit_dt = pd.to_datetime(merged_df_NEW['admittime'])
if admit_dt.dt.tz is not None:
    # Timezone-aware values are normalised to naive UTC so that the epoch
    # subtraction below is valid and still measures seconds since 1970-01-01 UTC.
    admit_dt = admit_dt.dt.tz_convert('UTC').dt.tz_localize(None)
merged_df_NEW['admittime'] = (admit_dt - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

# Clean column names: every run of non-word characters is replaced by a single
# underscore. NOTE(review): two distinct original names could collapse to the
# same cleaned name; verify the dataset has no such collisions.
cleaned_columns = [re.sub(r'\W+', '_', col) for col in merged_df_NEW.columns]
merged_df_NEW.columns = cleaned_columns

#### Separate Columns for Imputation

In [None]:
# Columns that must be left untouched by the imputer. Only the names that are
# actually present in the DataFrame are kept, so a missing column does not
# cause an error further down.
exclude_columns = [
    name
    for name in ['survival_time', 'deathtime', 'LOS']
    if name in merged_df_NEW.columns
]

# Everything that is numeric and not excluded is handed to the imputer.
columns_to_impute = (
    merged_df_NEW
    .drop(columns=exclude_columns)
    .select_dtypes(include=[np.number])
    .columns
)

# Sub-frame holding only the columns selected for imputation
data_to_impute = merged_df_NEW[columns_to_impute]

#### Define Imputer and Perform Imputation

In [None]:
# Define the random forest model used as the per-feature regressor.
# random_state is fixed so repeated runs give the same imputations.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)

# Create an IterativeImputer that models each feature from the others using
# the random forest above, for at most 10 imputation rounds.
imputer = IterativeImputer(estimator=rf_model, max_iter=10, random_state=0)

# Columns without a single observed value cannot be imputed. By default
# IterativeImputer drops such features from its output, which would make the
# result narrower than `columns_to_impute` and break the DataFrame
# construction below with a shape mismatch. Impute only columns that contain
# at least one observed value.
has_observed_values = data_to_impute.notna().any(axis=0).to_numpy()
imputable_columns = data_to_impute.columns[has_observed_values]

# Perform the imputation
imputed_data = imputer.fit_transform(data_to_impute[imputable_columns])

# Convert the imputed data back to a DataFrame, then restore the full original
# column set and order; columns that were entirely missing stay all-NaN.
completed_data = pd.DataFrame(imputed_data, columns=imputable_columns)
completed_data = completed_data.reindex(columns=columns_to_impute)

#### Combine Imputed and Excluded Data

In [None]:
# Put the untouched (excluded) columns back next to the imputed ones. The
# excluded part gets a fresh 0..n-1 index so that it lines up row by row with
# `completed_data`, which was built from a plain array.
# NOTE(review): non-numeric columns that are not in `exclude_columns` are not
# part of either piece and therefore do not appear in the result - confirm
# this is intended.
excluded_part = merged_df_NEW[exclude_columns].reset_index(drop=True)
imputed_df_RF = pd.concat([completed_data, excluded_part], axis=1)

# 'admittime' is stored as seconds since the epoch; scale it to nanoseconds
# and convert it back to datetime values.
admittime_ns = imputed_df_RF['admittime'] * 10**9
imputed_df_RF['admittime'] = pd.to_datetime(admittime_ns)

#### Save Imputed Data and End Timing

In [None]:
# Write the combined result to disk without the row index
imputed_df_RF.to_csv('RF_imputation_NEW.csv', index=False)

# Stop the clock and report how long the run took, measured from `start_time`
end_time = time.time()
elapsed_seconds = end_time - start_time
print("Execution Time:", elapsed_seconds, "seconds")