In [1]:
import pandas as pd
from sklearn.impute import *
import numpy as np
from collections import Counter

In [2]:
col_types = {
 'record_id': 'string',
 'abnormal_ln_present': 'categorical',
 'abnormal_ln_size': 'real',
 'dob': 'time',
 'age_at_dx': 'real',
 'biop_to_dx': 'real',
 'surg_to_dx': 'real',
 'men_status': 'categorical',
 'fmhx': 'categorical',
 'gensus___1': 'categorical',
 'gensus___2': 'categorical',
 'gensus___3': 'categorical',
 'gensus___4': 'categorical',
 'gensus___5': 'categorical',
 'gensus___6': 'categorical',
 'gensus___7': 'categorical',
 'systhe___1': 'categorical',
 'systhe___2': 'categorical',
 'systhe___3': 'categorical',
 'systhe___4': 'categorical',
 'systhe___5': 'categorical',
 'tumor_laterality': 'categorical',
 'height_cm': 'real',
 'weight_kg': 'real',
 'bra_cup_size': 'ordinal',
 'bra_cup_size_measure': 'real',
 'palpability': 'categorical',
 'axillary_lymph_node_palpab': 'categorical',
 'dximg___1': 'categorical',
 'dximg___2': 'categorical',
 'dximg___3': 'categorical',
 'dximg_date': 'time',
 'img_size': 'real',
 'bi_rads_score': 'real',
 'foci': 'ordinal',
 'tumor_stge': 'ordinal',
 'abnormal_lymph': 'categorical',
 'lymph_node_max_size_mm': 'real',
 'tumor_size_mm': 'real',
 'microcalcifications': 'categorical',
 'extent_of_calcification_ma': 'real',
 'prominent_axillary_lymph': 'categorical',
 'lymph_node_max_size_mm0': 'real',
 'backgroun_enhancement': 'categorical',
 'max_enhancement_measurment': 'real',
 'axillary_lymphadenopathy': 'categorical',
 'internal_mammary_lymphaden': 'categorical',
 'high_grade_fdg_foci_presen': 'categorical',
 'size_of_the_largest_foci_c': 'real',
 'axillary_lymphadenopathy_p': 'categorical',
 'axillary_lymph_node_max_si': 'real',
 'int_mammary_lymphade_pet': 'categorical',
 'internal_mammary_lymph_nod': 'real',
 'pre_op_biop_date': 'time',
 'pre_op_biopsy': 'categorical',
 'tumor_location': 'categorical',
 'tumor_location_trans': 'real',
 'his_subtype___1': 'categorical',
 'his_subtype___2': 'categorical',
 'his_subtype___3': 'categorical',
 'his_subtype___4': 'categorical',
 'his_subtype___5': 'categorical',
 'his_subtype___6': 'categorical',
 'specify_histology_if_non_o': 'categorical',
 'tumor_grade': 'ordinal',
 'margin_status': 'categorical',
 'closest_margin': 'categorical',
 'closest_margin_trans': 'real',
 'distance_from_closest_marg': 'real',
 'lymphovascular_invasion0': 'categorical',
 'er_status': 'ordinal',
 'pr_status': 'ordinal',
 'her_status': 'ordinal',
 'imaging_and_biopsy_concord': 'ordinal',
 'axillary_lymph_node_core_b': 'categorical',
 'metastatic_carcinoma_on_ax': 'categorical',
 'surgical_indication1_primary_treatment___1': 'categorical',
 'surgical_indication1_primary_treatment___2': 'categorical',
 'surgical_indication1_primary_treatment___3': 'categorical',
 'surgical_indication1_primary_treatment___4': 'categorical',
 'surgical_indication1_primary_treatment___5': 'categorical',
 'laterality': 'categorical',
 'surgery_date': 'time',
 'breast_procedure': 'categorical',
 'axillary_surgery___1': 'categorical',
 'axillary_surgery___2': 'categorical',
 'axillary_surgery___3': 'categorical',
 'lymph_nodes': 'real',
 'sln': 'real',
 'number_of_positive_sln': 'real',
 'ax_nodes': 'real',
 'mastectomy_weight_g': 'real',
 'tumor_size': 'real',
 'tumor_loc': 'categorical',
 'his_type___1': 'categorical',
 'his_type___2': 'categorical',
 'his_type___3': 'categorical',
 'his_type___4': 'categorical',
 'his_type___5': 'categorical',
 'his_type___6': 'categorical',
 'his_type___7': 'categorical',
 'his_type___8': 'categorical',
 'specify_histology_if_other': 'categorical',
 'tu_grade': 'ordinal',
 'tumor_focality': 'categorical',
 'num_foci': 'real',
 'lymphovascular_invasion': 'categorical',
 'in_situ_component_present': 'categorical',
 'in_situ_component_type___1': 'categorical',
 'in_situ_component_type___2': 'categorical',
 'in_situ_component_size_mm': 'real',
 'in_situ_component_grade': 'ordinal',
 'mar_status': 'categorical',
 'clos_margin___1': 'categorical',
 'clos_margin___2': 'categorical',
 'clos_margin___3': 'categorical',
 'clos_margin___4': 'categorical',
 'clos_margin___5': 'categorical',
 'clos_margin___6': 'categorical',
 'clos_margin___7': 'categorical',
 'dis_closest_margin': 'real',
 'metastasis': 'categorical',
 'ln_w_micrometastasis': 'real',
 'ln_w_macrometastasis': 'real',
 'size_of_largest_nodal_meta': 'real',
 'extranodal_extension': 'categorical',
 'extent_of_extranodal_exten': 'real',
 'did_the_patient_receive_pm': 'categorical',
 'did_the_patient_reject_pmr': 'categorical',
 'data_collection_fields_complete': 'categorical'
            }
imputation_dict = {
#     "bi_rads_score": "",
#     "tumor_stge": "",
    "axillary_lymph_node_palpab": 0,
    "abnormal_lymph": 2,
    "lymph_node_max_size_mm": 0,
    "extent_of_calcification_ma": 0,
    "prominent_axillary_lymph": 2,
    "backgroun_enhancement": 2,
    "max_enhancement_measurement": 0,
    "axillary_lymphadenopathy": 2,
    "internal_mammary_lymphaden": 2,
#     "high_grade_fdg_foci_presen": "",
#     "size_of_the_largest_foci_c": "",
    "axillary_lymphadenopathy_p": 0,
#     "axillary_lymph_node_max_si"
    "internal_mammry_lymph_nod": 0,
    "er_status": 0.5,
    "pr_status": 0.5,
    "her_status": 0.5,
    "axillary_lymph_node_core_b": 0,
}


# Pre-operative columns, including some intra columns (e.g., surgery type and estimated surgery date)
pre_cols = ['record_id', 'abnormal_ln_present', "abnormal_ln_size", 'dob', 'age_at_dx', 'biop_to_dx', 'surg_to_dx', 'men_status', 'fmhx', 'gensus___1', 'gensus___2', 'gensus___3', 'gensus___4', 'gensus___5', 'gensus___6', 'gensus___7', 'systhe___1', 'systhe___2', 'systhe___3', 'systhe___4', 'systhe___5', 'tumor_laterality', 'height_cm', 'weight_kg', 'bra_cup_size', 'bra_cup_size_measure', 'palpability', 'axillary_lymph_node_palpab', 'dximg___1', 'dximg___2', 'dximg___3', 'dximg_date', 'img_size', 'foci', 'tumor_stge', 'abnormal_lymph', 'lymph_node_max_size_mm', 'tumor_size_mm', 'microcalcifications', 'extent_of_calcification_ma', 'prominent_axillary_lymph', 'lymph_node_max_size_mm0', 'backgroun_enhancement', 'max_enhancement_measurment', 'axillary_lymphadenopathy', 'internal_mammary_lymphaden', 'high_grade_fdg_foci_presen', 'size_of_the_largest_foci_c', 'axillary_lymphadenopathy_p', 'axillary_lymph_node_max_si', 'int_mammary_lymphade_pet', 'internal_mammary_lymph_nod', 'pre_op_biop_date', 'pre_op_biopsy', 'tumor_location', 'tumor_location_trans', 'his_subtype___1', 'his_subtype___2', 'his_subtype___3', 'his_subtype___4', 'his_subtype___5', 'his_subtype___6', 'specify_histology_if_non_o', 'tumor_grade', 'margin_status', 'closest_margin', 'closest_margin_trans', 'distance_from_closest_marg', 'lymphovascular_invasion0', 'er_status', 'pr_status', 'her_status', 'imaging_and_biopsy_concord', 'axillary_lymph_node_core_b', 'metastatic_carcinoma_on_ax', 'bi_rads_score', 'surgical_indication1_primary_treatment___1', 'surgical_indication1_primary_treatment___2', 'surgical_indication1_primary_treatment___3', 'surgical_indication1_primary_treatment___4', 'surgical_indication1_primary_treatment___5', 'laterality', 'surgery_date', 'breast_procedure', 'axillary_surgery___1', 'axillary_surgery___2', 'axillary_surgery___3']

# Intra columns
intra_cols = ['lymph_nodes', 'sln']

# Post-operative columns, not available during prediction
post_cols = ['number_of_positive_sln', 'ax_nodes', 'mastectomy_weight_g', 'tumor_size', 'tumor_loc', 'his_type___1', 'his_type___2', 'his_type___3', 'his_type___4', 'his_type___5', 'his_type___6', 'his_type___7', 'his_type___8', 'specify_histology_if_other', 'tu_grade', 'tumor_focality', 'num_foci', 'lymphovascular_invasion', 'in_situ_component_present', 'in_situ_component_type___1', 'in_situ_component_type___2', 'in_situ_component_size_mm', 'in_situ_component_grade', 'mar_status', 'clos_margin___1', 'clos_margin___2', 'clos_margin___3', 'clos_margin___4', 'clos_margin___5', 'clos_margin___6', 'clos_margin___7', 'dis_closest_margin', 'metastasis', 'ln_w_micrometastasis', 'ln_w_macrometastasis', 'size_of_largest_nodal_meta', 'extranodal_extension', 'extent_of_extranodal_exten', 'did_the_patient_receive_pm', 'did_the_patient_reject_pmr', 'data_collection_fields_complete']


# Helper to print in terminal with colors
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def my_print(*args, add_sep=False):
    # print with orange
    text = " ".join(args)
    if add_sep:
        text = "-"*50+"\n"+text+"\n"+"-"*50
    print(bcolors.WARNING, text, bcolors.ENDC)



In [3]:
df_path = r"/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/AllTranTrainVal.csv"
metadata_df_path = r"/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/metadata/col_metadata.csv"
df = pd.read_csv(df_path)
df_orig = pd.DataFrame(df)

# Rename each column in DataFrame with prefixes (PRE, POST)
rename_dict = {}
for col in df.columns:
    if col in pre_cols:
        prefix = "PRE"
    elif col in intra_cols:
        prefix = "INT"
    elif col in post_cols:
        prefix = "POS"
    else:
        raise ValueError(f"{col} column doesn't belong to PRE/INTRA/POST columns, please check.")
    rename_dict[col] = prefix+"_"+col
    col_types[prefix+"_"+col] = col_types.pop(col)
df.rename(columns=rename_dict, inplace=True)

# Calculate sparsity of each column
sparsity_dict = {}
for col in df.columns:
    sparsity_dict[col] = df[col].isna().sum() / len(df)

metadata_df = pd.DataFrame.from_dict({k:[sparsity_dict[k], col_types[k]] for k in df.columns})
metadata_df.to_csv(metadata_df_path)

# Preprocess columns by constructing new columns (feature engineering)
abnormal_ln_cols = ['PRE_abnormal_lymph', 'PRE_prominent_axillary_lymph', 'PRE_axillary_lymphadenopathy', 'PRE_internal_mammary_lymphaden', 'PRE_axillary_lymphadenopathy_p', 'PRE_int_mammary_lymphade_pet']
abnormal_ln_size_cols = ['PRE_lymph_node_max_size_mm', 'PRE_lymph_node_max_size_mm0', 'PRE_axillary_lymph_node_max_si', 'PRE_internal_mammary_lymph_nod']
if "PRE_abnormal_ln_size" not in df.columns:
    abnormal_ln_sizes = []
    for i, row in df.iterrows():
        max_size = 0
        for col in abnormal_ln_size_cols:
            value = row[col]
            if str(value) == "nan":
                continue
            max_size = max(max_size, value)
        abnormal_ln_sizes.append(str(max_size))
    df.insert(10, "PRE_abnormal_ln_size", abnormal_ln_sizes)

if "PRE_abnormal_ln_present" not in df.columns:
    abnormal_ln = []
    for i, row in df.iterrows():
        cur = "0"
        for col in abnormal_ln_cols:
            value = row[col]
            if str(value) == "nan":
                continue
            else:
                if str(value).strip().replace(".0","")  == "1":
                    cur = "1"
        abnormal_ln.append(cur)
    df.insert(11, "PRE_abnormal_ln_present",abnormal_ln)


# assert set(col_types) == set(df.columns) 

In [4]:
def pre_process(df):
    # Convert Time columns into Numeric columns
    for col, c_type in col_types.items():
        if c_type == "time":
            my_print(f"Converted {col} from time to numeric type.")
            df[col] = df[col].apply(lambda x: pd.to_datetime(x).value)
    return df

def expert_impute(df):
    for col, val in imputation_dict.items():
        if col not in df.columns:
            my_print(f"Skipped {col} not in DataFrame.")
        else:
            my_print(f"Imputed {col} with {val}.")
            df[col].fillna(value=val, inplace=True)
    return df
    
df = pre_process(df)
df = expert_impute(df)
df.head()

[93m Converted PRE_dob from time to numeric type. [0m
[93m Converted PRE_dximg_date from time to numeric type. [0m
[93m Converted PRE_pre_op_biop_date from time to numeric type. [0m
[93m Converted PRE_surgery_date from time to numeric type. [0m
[93m Skipped axillary_lymph_node_palpab not in DataFrame. [0m
[93m Skipped abnormal_lymph not in DataFrame. [0m
[93m Skipped lymph_node_max_size_mm not in DataFrame. [0m
[93m Skipped extent_of_calcification_ma not in DataFrame. [0m
[93m Skipped prominent_axillary_lymph not in DataFrame. [0m
[93m Skipped backgroun_enhancement not in DataFrame. [0m
[93m Skipped max_enhancement_measurement not in DataFrame. [0m
[93m Skipped axillary_lymphadenopathy not in DataFrame. [0m
[93m Skipped internal_mammary_lymphaden not in DataFrame. [0m
[93m Skipped axillary_lymphadenopathy_p not in DataFrame. [0m
[93m Skipped internal_mammry_lymph_nod not in DataFrame. [0m
[93m Skipped er_status not in DataFrame. [0m
[93m Skipped pr_stat

Unnamed: 0,PRE_record_id,PRE_dob,PRE_men_status,PRE_fmhx,PRE_gensus___1,PRE_gensus___2,PRE_gensus___3,PRE_gensus___4,PRE_gensus___5,PRE_gensus___6,...,POS_dis_closest_margin,POS_metastasis,POS_ln_w_micrometastasis,POS_ln_w_macrometastasis,POS_size_of_largest_nodal_meta,POS_extranodal_extension,POS_extent_of_extranodal_exten,POS_did_the_patient_receive_pm,POS_did_the_patient_reject_pmr,POS_data_collection_fields_complete
0,ANN001,-291945600000000000,2.0,0.0,0,0,0,0,0,1,...,,,,,,,,0.0,,2
1,ANN002,-126230400000000000,2.0,0.0,0,0,0,0,0,1,...,20.0,2.0,,,,2.0,,0.0,,2
2,ANN003,-239414400000000000,2.0,0.0,0,0,0,0,0,1,...,1.0,2.0,,,,2.0,,0.0,,2
3,ANN004,-289267200000000000,2.0,1.0,1,0,0,0,0,0,...,3.0,2.0,,,,2.0,,0.0,,2
4,ANN005,-323568000000000000,2.0,0.0,0,0,0,0,0,1,...,,1.0,0.0,1.0,5.0,1.0,,1.0,,2


In [5]:
columns = sorted(df.columns, key=lambda x: df[x].isna().sum())
sparsity_threshold = 0.05 # If more than sparsity_threshold of cells are missing, then don't impute the column at all
# Remove sparse columns
for y_col in list(columns):
    if y_col == "record_id":
        my_print(f"Removed record_id")
        columns.remove(y_col)
    if df[y_col].isna().sum() / len(df) > sparsity_threshold:
        my_print(f"{y_col} missingness {df[y_col].isna().sum() / len(df)} above threshold. Skipped")
        columns.remove(y_col)
pre_cols = ["PRE_"+e for e in pre_cols]
print(f"{len(columns)} out of {len(df.columns)} all columns are solid and are thus kept.")
pre_solid_cols = set(pre_cols).intersection(columns).union(set(['POS_did_the_patient_receive_pm']))
print(f"{len(pre_solid_cols)} out of {len(columns)} solid columns are PRE and are thus kept.")

very_solid_df = pd.DataFrame(df[pre_solid_cols])

very_solid_df.head()

[93m PRE_er_status missingness 0.06166666666666667 above threshold. Skipped [0m
[93m PRE_tumor_grade missingness 0.065 above threshold. Skipped [0m
[93m PRE_imaging_and_biopsy_concord missingness 0.06666666666666667 above threshold. Skipped [0m
[93m PRE_axillary_lymph_node_core_b missingness 0.08 above threshold. Skipped [0m
[93m POS_mar_status missingness 0.125 above threshold. Skipped [0m
[93m POS_extranodal_extension missingness 0.13166666666666665 above threshold. Skipped [0m
[93m POS_ax_nodes missingness 0.13666666666666666 above threshold. Skipped [0m
[93m INT_sln missingness 0.13833333333333334 above threshold. Skipped [0m
[93m POS_metastasis missingness 0.13833333333333334 above threshold. Skipped [0m
[93m INT_lymph_nodes missingness 0.14 above threshold. Skipped [0m
[93m PRE_pr_status missingness 0.15333333333333332 above threshold. Skipped [0m
[93m PRE_her_status missingness 0.155 above threshold. Skipped [0m
[93m POS_lymphovascular_invasion missingne

Unnamed: 0,PRE_dob,PRE_dximg___2,PRE_gensus___5,PRE_laterality,PRE_systhe___4,PRE_fmhx,PRE_gensus___6,PRE_surgical_indication1_primary_treatment___4,PRE_systhe___2,PRE_surgical_indication1_primary_treatment___5,...,PRE_axillary_surgery___1,PRE_axillary_surgery___2,PRE_axillary_surgery___3,PRE_his_subtype___4,PRE_men_status,PRE_dximg___1,POS_did_the_patient_receive_pm,PRE_gensus___3,PRE_abnormal_ln_size,PRE_surgical_indication1_primary_treatment___2
0,-291945600000000000,1,0,3.0,0,0.0,1,0,0,0,...,0,0,1,0,2.0,1,0.0,0,31.0,0
1,-126230400000000000,1,0,1.0,1,0.0,1,0,0,0,...,0,1,0,0,2.0,1,0.0,0,0.0,0
2,-239414400000000000,1,0,1.0,1,0.0,1,0,0,0,...,0,1,0,0,2.0,0,0.0,0,0.0,0
3,-289267200000000000,0,0,3.0,0,1.0,0,0,0,0,...,0,1,0,0,2.0,1,0.0,0,0.0,0
4,-323568000000000000,1,0,3.0,0,0.0,1,0,0,0,...,0,1,0,0,2.0,1,1.0,0,0.0,0


In [6]:
columns = sorted(df.columns, key=lambda x: df[x].isna().sum())
sparsity_threshold = 0.2 # If more than sparsity_threshold of cells are missing, then don't impute the column at all
# Remove sparse columns
for y_col in list(columns):
    if y_col == "record_id":
        my_print(f"Removed record_id")
        columns.remove(y_col)
    if df[y_col].isna().sum() / len(df) > sparsity_threshold:
        my_print(f"{y_col} missingness {round(df[y_col].isna().sum() / len(df), 3)} above threshold. Skipped")
        columns.remove(y_col)
        
print(f"{len(columns)} out of {len(df.columns)} all columns are solid and are thus kept.")
pre_solid_cols = set(pre_cols).intersection(columns).union(set(['POS_did_the_patient_receive_pm']))
print(f"{len(pre_solid_cols)} out of {len(columns)} dense columns are PRE and are thus kept.")

df = pd.DataFrame(df[pre_solid_cols])

df.head()

[93m POS_in_situ_component_present missingness 0.212 above threshold. Skipped [0m
[93m PRE_abnormal_lymph missingness 0.242 above threshold. Skipped [0m
[93m PRE_img_size missingness 0.245 above threshold. Skipped [0m
[93m PRE_prominent_axillary_lymph missingness 0.27 above threshold. Skipped [0m
[93m POS_tu_grade missingness 0.28 above threshold. Skipped [0m
[93m POS_did_the_patient_reject_pmr missingness 0.307 above threshold. Skipped [0m
[93m POS_dis_closest_margin missingness 0.322 above threshold. Skipped [0m
[93m POS_tumor_size missingness 0.327 above threshold. Skipped [0m
[93m POS_tumor_focality missingness 0.387 above threshold. Skipped [0m
[93m POS_in_situ_component_grade missingness 0.442 above threshold. Skipped [0m
[93m PRE_tumor_location missingness 0.48 above threshold. Skipped [0m
[93m PRE_tumor_location_trans missingness 0.482 above threshold. Skipped [0m
[93m PRE_tumor_size_mm missingness 0.555 above threshold. Skipped [0m
[93m PRE_weight_kg

Unnamed: 0,PRE_dob,PRE_dximg___2,PRE_gensus___5,PRE_laterality,PRE_systhe___4,PRE_fmhx,PRE_gensus___6,PRE_surgical_indication1_primary_treatment___4,PRE_systhe___2,PRE_her_status,...,PRE_axillary_surgery___3,PRE_his_subtype___4,PRE_men_status,PRE_dximg___1,POS_did_the_patient_receive_pm,PRE_imaging_and_biopsy_concord,PRE_gensus___3,PRE_abnormal_ln_size,PRE_er_status,PRE_surgical_indication1_primary_treatment___2
0,-291945600000000000,1,0,3.0,0,0.0,1,0,0,2.0,...,1,0,2.0,1,0.0,1.0,0,31.0,2.0,0
1,-126230400000000000,1,0,1.0,1,0.0,1,0,0,,...,0,0,2.0,1,0.0,1.0,0,0.0,1.0,0
2,-239414400000000000,1,0,1.0,1,0.0,1,0,0,2.0,...,0,0,2.0,0,0.0,1.0,0,0.0,1.0,0
3,-289267200000000000,0,0,3.0,0,1.0,0,0,0,2.0,...,0,0,2.0,1,0.0,1.0,0,0.0,2.0,0
4,-323568000000000000,1,0,3.0,0,0.0,1,0,0,1.0,...,0,0,2.0,1,1.0,1.0,0,0.0,1.0,0


In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_KNN = KNNImputer(n_neighbors=3)
imp_RF = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10))

In [8]:
def impute(imputer, df):
    new_df = pd.DataFrame(imputer.fit_transform(df))
    new_df.columns = df.columns
    new_df.index = df.index
    return new_df

df = df.drop("PRE_record_id", axis=1)

In [9]:
df_mode = impute(imp_mode, df)

In [10]:
df_mean = impute(imp_mean, df)

In [11]:
df_KNN = impute(imp_KNN, df)

In [12]:
df_RF = impute(imp_RF, df)

In [13]:
import random
from collections import defaultdict
df_compare_path = r"D:\\YifuChen\\Jan20-2022\\H2ODAI\\dai-1.10.1\\data\\Impute_Comparison.csv"
compare_df = pd.DataFrame()
sample_idx = very_solid_df.sample(frac=0.1).index
avg_rmse = defaultdict(int)
for col in df.columns:
    try:
        my_print(f"Imputing {col}...")
        imp_results = {}
        temp_df = very_solid_df.copy(deep=True)
        if col not in temp_df.columns:
            print(f"{col} is solid")
            temp_df[col] = df[col]
        else:
            print(f"{col} is very solid")
        temp_df.loc[sample_idx, col] = np.nan
        compare_df[f"{col}_gt"] = df.loc[sample_idx, col]
        for (name, imp) in zip(["mode", "mean", "KNN", "RF"], [imp_mode, imp_mean, imp_KNN, imp_RF]):
            imp_df = impute(imp, temp_df)
            rmse = ((df.loc[sample_idx, col] - imp_df.loc[sample_idx, col]) ** 2).mean() ** .5
            imp_results[imp] = {"data": imp_df, "rmse": rmse}
            print(name, rmse)
            avg_rmse[name] += rmse
            compare_df[f"{col}_{name}"] = imp_df.loc[sample_idx, col]
        compare_df.to_csv(df_compare_path)
    except:
        continue

[93m Imputing PRE_dob... [0m
PRE_dob is very solid
mode 3.7861091650011366e+17
[93m Imputing PRE_dximg___2... [0m
PRE_dximg___2 is very solid
mode 0.5163977794943222
[93m Imputing PRE_gensus___5... [0m
PRE_gensus___5 is very solid
mode 0.12909944487358055
[93m Imputing PRE_laterality... [0m
PRE_laterality is very solid
mode 0.806225774829855
[93m Imputing PRE_systhe___4... [0m
PRE_systhe___4 is very solid
mode 0.5
[93m Imputing PRE_fmhx... [0m
PRE_fmhx is very solid
mode 0.6191391873668903
[93m Imputing PRE_gensus___6... [0m
PRE_gensus___6 is very solid
mode 0.4281744192888376
[93m Imputing PRE_surgical_indication1_primary_treatment___4... [0m
PRE_surgical_indication1_primary_treatment___4 is very solid
mode 0.18257418583505536
[93m Imputing PRE_systhe___2... [0m
PRE_systhe___2 is very solid
mode 0.0
[93m Imputing PRE_her_status... [0m
PRE_her_status is solid
mode 0.5091750772173156
[93m Imputing PRE_surgical_indication1_primary_treatment___5... [0m
PRE_surgical_i