In [1]:
import pandas as pd
from sklearn.impute import *
import numpy as np
from collections import Counter

In [2]:
col_types = {
    'record_id': 'string',
    'abnormal_ln_present': 'enum',
    'abnormal_ln_size': 'real',
 'dob': 'time',
 'age_at_dx': 'real',
 'biop_to_dx': 'real',
 'surg_to_dx': 'real',
 'men_status': 'enum',
 'fmhx': 'int',
 'gensus___1': 'enum',
 'gensus___2': 'enum',
 'gensus___3': 'enum',
 'gensus___4': 'enum',
 'gensus___5': 'enum',
 'gensus___6': 'enum',
 'gensus___7': 'enum',
 'systhe___1': 'enum',
 'systhe___2': 'enum',
 'systhe___3': 'enum',
 'systhe___4': 'enum',
 'systhe___5': 'enum',
 'tumor_laterality': 'enum',
 'height_cm': 'real',
 'weight_kg': 'real',
 'bra_cup_size': 'enum',
 'bra_cup_size_measure': 'real',
 'palpability': 'enum',
 'axillary_lymph_node_palpab': 'enum',
 'dximg___1': 'enum',
 'dximg___2': 'enum',
 'dximg___3': 'enum',
 'dximg_date': 'time',
 'img_size': 'real',
 'bi_rads_score': 'enum',
 'foci': 'enum',
 'tumor_stge': 'enum',
 'abnormal_lymph': 'enum',
 'lymph_node_max_size_mm': 'real',
 'tumor_size_mm': 'real',
 'microcalcifications': 'enum',
 'extent_of_calcification_ma': 'enum',
 'prominent_axillary_lymph': 'enum',
 'lymph_node_max_size_mm0': 'real',
 'backgroun_enhancement': 'enum',
 'max_enhancement_measurment': 'real',
 'axillary_lymphadenopathy': 'enum',
 'internal_mammary_lymphaden': 'enum',
 'high_grade_fdg_foci_presen': 'enum',
 'size_of_the_largest_foci_c': 'real',
 'axillary_lymphadenopathy_p': 'enum',
 'axillary_lymph_node_max_si': 'enum',
 'int_mammary_lymphade_pet': 'real',
 'internal_mammary_lymph_nod': 'real',
 'pre_op_biop_date': 'time',
 'pre_op_biopsy': 'enum',
 'tumor_location': 'enum',
 'tumor_location_trans': 'int',
 'his_subtype___1': 'enum',
 'his_subtype___2': 'enum',
 'his_subtype___3': 'enum',
 'his_subtype___4': 'enum',
 'his_subtype___5': 'enum',
 'his_subtype___6': 'enum',
 'specify_histology_if_non_o': 'enum',
 'tumor_grade': 'enum',
 'margin_status': 'enum',
 'closest_margin': 'enum',
 'closest_margin_trans': 'enum',
 'distance_from_closest_marg': 'real',
 'lymphovascular_invasion0': 'enum',
 'er_status': 'enum',
 'pr_status': 'enum',
 'her_status': 'enum',
 'imaging_and_biopsy_concord': 'enum',
 'axillary_lymph_node_core_b': 'enum',
 'metastatic_carcinoma_on_ax': 'enum',
 'surgical_indication1_primary_treatment___1': 'enum',
 'surgical_indication1_primary_treatment___2': 'enum',
 'surgical_indication1_primary_treatment___3': 'enum',
 'surgical_indication1_primary_treatment___4': 'enum',
 'surgical_indication1_primary_treatment___5': 'enum',
 'laterality': 'enum',
 'surgery_date': 'time',
 'breast_procedure': 'enum',
 'axillary_surgery___1': 'enum',
 'axillary_surgery___2': 'enum',
 'axillary_surgery___3': 'enum',
 'lymph_nodes': 'int',
 'sln': 'int',
 'number_of_positive_sln': 'int',
 'ax_nodes': 'int',
 'mastectomy_weight_g': 'real',
 'tumor_size': 'real',
 'tumor_loc': 'enum',
 'his_type___1': 'enum',
 'his_type___2': 'enum',
 'his_type___3': 'enum',
 'his_type___4': 'enum',
 'his_type___5': 'enum',
 'his_type___6': 'enum',
 'his_type___7': 'enum',
 'his_type___8': 'enum',
 'specify_histology_if_other': 'enum',
 'tu_grade': 'enum',
 'tumor_focality': 'enum',
 'num_foci': 'real',
 'lymphovascular_invasion': 'enum',
 'in_situ_component_present': 'enum',
 'in_situ_component_type___1': 'enum',
 'in_situ_component_type___2': 'enum',
 'in_situ_component_size_mm': 'real',
 'in_situ_component_grade': 'enum',
 'mar_status': 'enum',
 'clos_margin___1': 'enum',
 'clos_margin___2': 'enum',
 'clos_margin___3': 'enum',
 'clos_margin___4': 'enum',
 'clos_margin___5': 'enum',
 'clos_margin___6': 'enum',
 'clos_margin___7': 'enum',
 'dis_closest_margin': 'real',
 'metastasis': 'enum',
 'ln_w_micrometastasis': 'int',
 'ln_w_macrometastasis': 'int',
 'size_of_largest_nodal_meta': 'real',
 'extranodal_extension': 'enum',
 'extent_of_extranodal_exten': 'real',
 'did_the_patient_receive_pm': 'enum',
 'did_the_patient_reject_pmr': 'enum',
 'data_collection_fields_complete': 'enum'
            }
imputation_dict = {
#     "bi_rads_score": "",
#     "tumor_stge": "",
    "axillary_lymph_node_palpab": 0,
    "abnormal_lymph": 2,
    "lymph_node_max_size_mm": 0,
    "extent_of_calcification_ma": 0,
    "prominent_axillary_lymph": 2,
    "backgroun_enhancement": 2,
    "max_enhancement_measurement": 0,
    "axillary_lymphadenopathy": 2,
    "internal_mammary_lymphaden": 2,
#     "high_grade_fdg_foci_presen": "",
#     "size_of_the_largest_foci_c": "",
    "axillary_lymphadenopathy_p": 0,
#     "axillary_lymph_node_max_si"
    "internal_mammry_lymph_nod": 0,
    "er_status": 0.5,
    "pr_status": 0.5,
    "her_status": 0.5,
    "axillary_lymph_node_core_b": 0,
}

# Helper to print in terminal with colors
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def my_print(*args, add_sep=False):
    # print with orange
    text = " ".join(args)
    if add_sep:
        text = "-"*50+"\n"+text+"\n"+"-"*50
    print(bcolors.WARNING, text, bcolors.ENDC)



In [4]:
df_path = r"/Users/yifuchen/Work/Notebooks/data/AllTranTrainVal.csv"
df = pd.read_csv(df_path)
df_orig = pd.DataFrame(df)
# Preprocess columns by constructing new columns (feature engineering)
abnormal_ln_cols = ["abnormal_lymph", "prominent_axillary_lymph", "axillary_lymphadenopathy", "internal_mammary_lymphaden", "axillary_lymphadenopathy_p", "int_mammary_lymphade_pet"]
abnormal_ln_size_cols = ["lymph_node_max_size_mm", "lymph_node_max_size_mm0", "axillary_lymph_node_max_si", "internal_mammary_lymph_nod"]
if "abnormal_ln_size" not in df.columns:
    abnormal_ln_sizes = []
    for i, row in df.iterrows():
        max_size = 0
        for col in abnormal_ln_size_cols:
            value = row[col]
            if str(value) == "nan":
                continue
            max_size = max(max_size, value)
        abnormal_ln_sizes.append(str(max_size))
    df.insert(10, "abnormal_ln_size", abnormal_ln_sizes)

if "abnormal_ln_present" not in df.columns:
    abnormal_ln = []
    for i, row in df.iterrows():
        cur = "0"
        for col in abnormal_ln_cols:
            value = row[col]
            if str(value) == "nan":
                continue
            else:
                if str(value).strip().replace(".0","")  == "1":
                    cur = "1"
        abnormal_ln.append(cur)
    df.insert(11, "abnormal_ln_present",abnormal_ln)


    
pre_cols = ['abnormal_ln_present', "abnormal_ln_size", 'dob', 'age_at_dx', 'biop_to_dx', 'surg_to_dx', 'men_status', 'fmhx', 'gensus___1', 'gensus___2', 'gensus___3', 'gensus___4', 'gensus___5', 'gensus___6', 'gensus___7', 'systhe___1', 'systhe___2', 'systhe___3', 'systhe___4', 'systhe___5', 'tumor_laterality', 'height_cm', 'weight_kg', 'bra_cup_size', 'bra_cup_size_measure', 'palpability', 'axillary_lymph_node_palpab', 'dximg___1', 'dximg___2', 'dximg___3', 'dximg_date', 'img_size', 'foci', 'tumor_stge', 'abnormal_lymph', 'lymph_node_max_size_mm', 'tumor_size_mm', 'microcalcifications', 'extent_of_calcification_ma', 'prominent_axillary_lymph', 'lymph_node_max_size_mm0', 'backgroun_enhancement', 'max_enhancement_measurment', 'axillary_lymphadenopathy', 'internal_mammary_lymphaden', 'high_grade_fdg_foci_presen', 'size_of_the_largest_foci_c', 'axillary_lymphadenopathy_p', 'axillary_lymph_node_max_si', 'int_mammary_lymphade_pet', 'internal_mammary_lymph_nod', 'pre_op_biop_date', 'pre_op_biopsy', 'tumor_location', 'tumor_location_trans', 'his_subtype___1', 'his_subtype___2', 'his_subtype___3', 'his_subtype___4', 'his_subtype___5', 'his_subtype___6', 'specify_histology_if_non_o', 'tumor_grade', 'margin_status', 'closest_margin', 'closest_margin_trans', 'distance_from_closest_marg', 'lymphovascular_invasion0', 'er_status', 'pr_status', 'her_status', 'imaging_and_biopsy_concord', 'axillary_lymph_node_core_b', 'metastatic_carcinoma_on_ax']

# Intra-operative columns, assumed not available during prediction
intra_cols = ['surgical_indication1_primary_treatment___1', 'surgical_indication1_primary_treatment___2', 'surgical_indication1_primary_treatment___3', 'surgical_indication1_primary_treatment___4', 'surgical_indication1_primary_treatment___5', 'laterality', 'surgery_date', 'breast_procedure', 'axillary_surgery___1', 'axillary_surgery___2', 'axillary_surgery___3', 'lymph_nodes', 'sln']

# Post-operative columns, not available during prediction
post_cols = ['number_of_positive_sln', 'ax_nodes', 'mastectomy_weight_g', 'tumor_size', 'tumor_loc', 'his_type___1', 'his_type___2', 'his_type___3', 'his_type___4', 'his_type___5', 'his_type___6', 'his_type___7', 'his_type___8', 'specify_histology_if_other', 'tu_grade', 'tumor_focality', 'num_foci', 'lymphovascular_invasion', 'in_situ_component_present', 'in_situ_component_type___1', 'in_situ_component_type___2', 'in_situ_component_size_mm', 'in_situ_component_grade', 'mar_status', 'clos_margin___1', 'clos_margin___2', 'clos_margin___3', 'clos_margin___4', 'clos_margin___5', 'clos_margin___6', 'clos_margin___7', 'dis_closest_margin', 'metastasis', 'ln_w_micrometastasis', 'ln_w_macrometastasis', 'size_of_largest_nodal_meta', 'extranodal_extension', 'extent_of_extranodal_exten']

# assert set(col_types) == set(df.columns) 

In [5]:
def pre_process(df):
    # Convert Time columns into Numeric columns
    for col, c_type in col_types.items():
        if c_type == "time":
            my_print(f"Converted {col} from time to numeric type.")
            df[col] = df[col].apply(lambda x: pd.to_datetime(x).value)
    return df

def expert_impute(df):
    for col, val in imputation_dict.items():
        if col not in df.columns:
            my_print(f"Skipped {col} not in DataFrame.")
        else:
            my_print(f"Imputed {col} with {val}.")
            df[col].fillna(value=val, inplace=True)
    return df
    
df = pre_process(df)
df = expert_impute(df)
df.head()

[93m Converted dob from time to numeric type. [0m
[93m Converted dximg_date from time to numeric type. [0m
[93m Converted pre_op_biop_date from time to numeric type. [0m
[93m Converted surgery_date from time to numeric type. [0m
[93m Imputed axillary_lymph_node_palpab with 0. [0m
[93m Imputed abnormal_lymph with 2. [0m
[93m Imputed lymph_node_max_size_mm with 0. [0m
[93m Imputed extent_of_calcification_ma with 0. [0m
[93m Imputed prominent_axillary_lymph with 2. [0m
[93m Imputed backgroun_enhancement with 2. [0m
[93m Skipped max_enhancement_measurement not in DataFrame. [0m
[93m Imputed axillary_lymphadenopathy with 2. [0m
[93m Imputed internal_mammary_lymphaden with 2. [0m
[93m Imputed axillary_lymphadenopathy_p with 0. [0m
[93m Skipped internal_mammry_lymph_nod not in DataFrame. [0m
[93m Imputed er_status with 0.5. [0m
[93m Imputed pr_status with 0.5. [0m
[93m Imputed her_status with 0.5. [0m
[93m Imputed axillary_lymph_node_core_b with 0. [0m


Unnamed: 0,record_id,dob,men_status,fmhx,gensus___1,gensus___2,gensus___3,gensus___4,gensus___5,gensus___6,...,dis_closest_margin,metastasis,ln_w_micrometastasis,ln_w_macrometastasis,size_of_largest_nodal_meta,extranodal_extension,extent_of_extranodal_exten,did_the_patient_receive_pm,did_the_patient_reject_pmr,data_collection_fields_complete
0,ANN001,-291945600000000000,2.0,0.0,0,0,0,0,0,1,...,,,,,,,,0.0,,2
1,ANN002,-126230400000000000,2.0,0.0,0,0,0,0,0,1,...,20.0,2.0,,,,2.0,,0.0,,2
2,ANN003,-239414400000000000,2.0,0.0,0,0,0,0,0,1,...,1.0,2.0,,,,2.0,,0.0,,2
3,ANN004,-289267200000000000,2.0,1.0,1,0,0,0,0,0,...,3.0,2.0,,,,2.0,,0.0,,2
4,ANN005,-323568000000000000,2.0,0.0,0,0,0,0,0,1,...,,1.0,0.0,1.0,5.0,1.0,,1.0,,2


In [10]:
columns = sorted(df.columns, key=lambda x: df[x].isna().sum())
sparsity_threshold = 0.05 # If more than sparsity_threshold of cells are missing, then don't impute the column at all
# Remove sparse columns
for y_col in list(columns):
    if y_col == "record_id":
        my_print(f"Removed record_id")
        columns.remove(y_col)
    if df[y_col].isna().sum() / len(df) > sparsity_threshold:
        my_print(f"{y_col} missingness {df[y_col].isna().sum() / len(df)} above threshold. Skipped")
        columns.remove(y_col)
        
print(f"{len(columns)} out of {len(df.columns)} all columns are solid and are thus kept.")
pre_solid_cols = set(pre_cols).intersection(columns).union(set(['did_the_patient_receive_pm']))
print(f"{len(pre_solid_cols)} out of {len(columns)} solid columns are PRE and are thus kept.")

very_solid_df = pd.DataFrame(df[pre_solid_cols])

very_solid_df.head()

[93m tumor_grade missingness 0.065 above threshold. Skipped [0m
[93m imaging_and_biopsy_concord missingness 0.06666666666666667 above threshold. Skipped [0m
[93m lymphovascular_invasion0 missingness 0.17333333333333334 above threshold. Skipped [0m
[93m microcalcifications missingness 0.19833333333333333 above threshold. Skipped [0m
45 out of 49 all columns are solid and are thus kept.
45 out of 45 solid columns are PRE and are thus kept.


Unnamed: 0,dob,gensus___3,axillary_lymphadenopathy_p,her_status,his_subtype___6,gensus___5,extent_of_calcification_ma,abnormal_ln_present,his_subtype___5,systhe___5,...,palpability,abnormal_ln_size,gensus___7,pre_op_biop_date,his_subtype___1,fmhx,his_subtype___2,axillary_lymphadenopathy,lymph_node_max_size_mm,abnormal_lymph
0,-291945600000000000,0,1.0,2.0,0,0,0.0,1,0,0,...,1.0,31.0,0,-9223372036854775808,0,0.0,0,2.0,27.0,1.0
1,-126230400000000000,0,0.0,0.5,0,0,36.0,0,0,0,...,0.0,0.0,0,-9223372036854775808,0,0.0,0,2.0,0.0,2.0
2,-239414400000000000,0,0.0,2.0,0,0,0.0,0,0,0,...,1.0,0.0,0,1530662400000000000,1,0.0,0,0.0,0.0,2.0
3,-289267200000000000,0,0.0,2.0,0,0,0.0,0,0,0,...,1.0,0.0,0,1539648000000000000,1,1.0,0,2.0,0.0,2.0
4,-323568000000000000,0,0.0,1.0,0,0,0.0,0,0,0,...,0.0,0.0,0,1416873600000000000,1,0.0,0,2.0,0.0,3.0


In [11]:
columns = sorted(df.columns, key=lambda x: df[x].isna().sum())
sparsity_threshold = 0.2 # If more than sparsity_threshold of cells are missing, then don't impute the column at all
# Remove sparse columns
for y_col in list(columns):
    if y_col == "record_id":
        my_print(f"Removed record_id")
        columns.remove(y_col)
    if df[y_col].isna().sum() / len(df) > sparsity_threshold:
        my_print(f"{y_col} missingness {round(df[y_col].isna().sum() / len(df), 3)} above threshold. Skipped")
        columns.remove(y_col)
        
print(f"{len(columns)} out of {len(df.columns)} all columns are solid and are thus kept.")
pre_solid_cols = set(pre_cols).intersection(columns).union(set(['did_the_patient_receive_pm']))
print(f"{len(pre_solid_cols)} out of {len(columns)} dense columns are PRE and are thus kept.")

df = pd.DataFrame(df[pre_solid_cols])

df.head()

49 out of 49 all columns are solid and are thus kept.
49 out of 49 dense columns are PRE and are thus kept.


Unnamed: 0,dob,gensus___3,axillary_lymphadenopathy_p,her_status,his_subtype___6,gensus___5,extent_of_calcification_ma,abnormal_ln_present,his_subtype___5,systhe___5,...,palpability,abnormal_ln_size,gensus___7,pre_op_biop_date,his_subtype___1,fmhx,his_subtype___2,axillary_lymphadenopathy,lymph_node_max_size_mm,abnormal_lymph
0,-291945600000000000,0,1.0,2.0,0,0,0.0,1,0,0,...,1.0,31.0,0,-9223372036854775808,0,0.0,0,2.0,27.0,1.0
1,-126230400000000000,0,0.0,0.5,0,0,36.0,0,0,0,...,0.0,0.0,0,-9223372036854775808,0,0.0,0,2.0,0.0,2.0
2,-239414400000000000,0,0.0,2.0,0,0,0.0,0,0,0,...,1.0,0.0,0,1530662400000000000,1,0.0,0,0.0,0.0,2.0
3,-289267200000000000,0,0.0,2.0,0,0,0.0,0,0,0,...,1.0,0.0,0,1539648000000000000,1,1.0,0,2.0,0.0,2.0
4,-323568000000000000,0,0.0,1.0,0,0,0.0,0,0,0,...,0.0,0.0,0,1416873600000000000,1,0.0,0,2.0,0.0,3.0


In [None]:
from sklearn.ensemble import RandomForestRegressor
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_KNN = KNNImputer(n_neighbors=3)
imp_RF = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10))

In [None]:
def impute(imputer, df):
    new_df = pd.DataFrame(imputer.fit_transform(df))
    new_df.columns = df.columns
    new_df.index = df.index
    return new_df

In [None]:
df_mode = impute(imp_mode, df)

In [None]:
df_mean = impute(imp_mean, df)

In [None]:
df_KNN = impute(imp_KNN, df)

In [None]:
df_RF = impute(imp_RF, df)

In [None]:
import random
from collections import defaultdict
df_compare_path = r"D:\\YifuChen\\Jan20-2022\\H2ODAI\\dai-1.10.1\\data\\Impute_Comparison.csv"
compare_df = pd.DataFrame()
sample_idx = temp_df.sample(frac=0.1).index
avg_rmse = defaultdict(int)
for col in df.columns:
    try:
        my_print(f"Imputing {col}...")
        imp_results = {}
        temp_df = very_solid_df.copy(deep=True)
        if col not in temp_df.columns:
            print(f"{col} is solid")
            temp_df[col] = df[col]
        else:
            print(f"{col} is very solid")
        temp_df.loc[sample_idx, col] = np.nan
        compare_df[f"{col}_gt"] = df.loc[sample_idx, col]
        for (name, imp) in zip(["mode", "mean", "KNN", "RF"], [imp_mode, imp_mean, imp_KNN, imp_RF]):
            imp_df = impute(imp, temp_df)
            rmse = ((df.loc[sample_idx, col] - imp_df.loc[sample_idx, col]) ** 2).mean() ** .5
            imp_results[imp] = {"data": imp_df, "rmse": rmse}
            print(name, rmse)
            avg_rmse[name] += rmse
            compare_df[f"{col}_{name}"] = imp_df.loc[sample_idx, col]
        compare_df.to_csv(df_compare_path)
    except:
        continue