# Baseline and Feature Selection for Imputer Value vs Missing Rates Analysis

### Load the results of the baseline and feature-selection pipelines for imputation evaluation only (RMSE only). The baseline uses no feature selection.

In [1]:
# {"Chi_square": "/home/abhishek/Desktop/Thesis/Thesis/Feature_Selection/RFE",
#                                  "correlation_coefficient": "/home/abhishek/Desktop/Thesis/Thesis/Feature_Selection/RFECV",
#                                  "GA": "/home/abhishek/Desktop/Thesis/Thesis/Feature_Selection/SelectKBest",
#                                 "IG": "/home/abhishek/Desktop/Thesis/Thesis/Feature_Selection/SelectFromModel",}

In [2]:
import os
import pandas as pd

# # Initialize the dictionary to store DataFrames
# dataframes = {}

# Function to load data from the experiment directory
def load_experiment_data(dataframes,directory,experiment_type_name):
    """Collect experiment result CSVs found under ``directory`` into ``dataframes``.

    The directory tree is walked recursively. A file is considered only if its
    name contains ``experiment_type_name`` and ends with ``.csv``:

    * when the containing path mentions ``"baseline"``, its rows are stored
      under the ``'baseline'`` key;
    * otherwise, when the containing path mentions ``"fs_pipeline"``, its rows
      are stored under the part of the file name that precedes
      ``experiment_type_name`` (trailing underscores stripped).

    Several files mapping to the same key are concatenated row-wise with a
    fresh index.

    Args:
        dataframes: dict that receives the loaded DataFrames (updated in place).
        directory: root directory to search.
        experiment_type_name: substring identifying the result files to load.

    Returns:
        The same ``dataframes`` dict that was passed in.
    """
    for folder, _subfolders, filenames in os.walk(directory):
        for filename in filenames:
            # Skip anything that is not a CSV of the requested experiment type.
            if experiment_type_name not in filename or not filename.endswith('.csv'):
                continue

            # The baseline check takes precedence over the fs_pipeline check.
            if "baseline" in folder:
                target_key = 'baseline'
            elif "fs_pipeline" in folder:
                target_key = filename.split(experiment_type_name)[0].rstrip('_')
            else:
                continue

            loaded = pd.read_csv(os.path.join(folder, filename))
            if target_key in dataframes:
                # Key already present: append the new rows to what was loaded before.
                dataframes[target_key] = pd.concat([dataframes[target_key], loaded], ignore_index=True)
            else:
                dataframes[target_key] = loaded
    return dataframes

    





# The code takes a directory that contains all the data.
# It expects two kinds of sub-directories: a baseline directory and one or more directories for the different types of feature selection.





In [3]:
# # Initialize a new dictionary to store DataFrames without NA values
# dataframes_no_na = {}
def drop_na(dataframes, dataframes_no_na):
    """Label the first column and drop every row that contains a missing value.

    For each DataFrame in ``dataframes`` the first column is renamed to
    ``"measure_and_missing_rates"`` and rows with any NA value are removed.
    The result is stored in ``dataframes_no_na`` under ``"<key>_no_na"``.

    The input DataFrames are left untouched: the rename is done on a copy
    rather than with ``inplace=True`` (which returns ``None`` and would
    silently modify the caller's data).

    Args:
        dataframes: dict mapping a name to a raw results DataFrame.
        dataframes_no_na: dict that receives the cleaned DataFrames
            (updated in place).

    Returns:
        The same ``dataframes_no_na`` dict that was passed in.
    """
    for key, df in dataframes.items():
        # rename() without inplace returns a new frame, so the source stays as loaded.
        renamed = df.rename(columns={df.columns[0]: "measure_and_missing_rates"})
        dataframes_no_na[f"{key}_no_na"] = renamed.dropna()
    return dataframes_no_na






In [4]:
# # Initialize a new dictionary to store DataFrames with specific rows removed
# dataframes_clean_data = {}

######
######
######
######
# # Define the measure type you want to drop (e.g., "MAE", "RMSE")
# measure_to_drop = "MAE"  # Change this value to drop a different measure type
def drop_specific_rows(dataframes_no_na, dataframes_clean_data, measure_to_drop):
    """Remove the rows belonging to one measure type from every DataFrame.

    A row is removed when the value in its first column contains
    ``measure_to_drop`` (matched with ``Series.str.contains``; missing values
    count as "no match"). Each filtered DataFrame is stored in
    ``dataframes_clean_data`` under the original key with ``'_no_na'``
    replaced by ``'_clean'``.

    Args:
        dataframes_no_na: dict mapping ``"<name>_no_na"`` to a DataFrame.
        dataframes_clean_data: dict that receives the filtered DataFrames
            (updated in place).
        measure_to_drop: pattern identifying the rows to discard, e.g. ``"MAE"``.

    Returns:
        The same ``dataframes_clean_data`` dict that was passed in.
    """
    for name, frame in dataframes_no_na.items():
        first_column = frame.iloc[:, 0]
        has_measure = first_column.str.contains(measure_to_drop, na=False)
        # Keep only the rows that do NOT mention the unwanted measure.
        dataframes_clean_data[name.replace('_no_na', '_clean')] = frame[~has_measure]

    return dataframes_clean_data



In [5]:


def cleaning_for_unique_imputers(dataframes_clean_data):
    """Keep one column per imputer (plus the measure column) in every DataFrame.

    Column names containing ``"Imputer("`` are grouped by the text found
    between ``"Imputer("`` and ``")_Estim"``; only the first column of each
    group is kept. A column whose name contains
    ``"measure_and_missing_rates"`` is kept as well. Every other column is
    dropped.

    Args:
        dataframes_clean_data: dict mapping a name to a DataFrame; each entry
            is replaced by its column-filtered version.

    Returns:
        The same ``dataframes_clean_data`` dict that was passed in.
    """
    label_column = "measure_and_missing_rates"

    for name in list(dataframes_clean_data):
        frame = dataframes_clean_data[name]

        # Maps imputer name -> the column chosen to represent it.
        chosen = {}
        for column in frame.columns:
            if label_column in column:
                chosen[label_column] = column
                continue
            if "Imputer(" not in column:
                continue
            imputer = column.split("Imputer(")[1].split(")_Estim")[0]
            # setdefault keeps the first column seen for this imputer.
            chosen.setdefault(imputer, column)

        dataframes_clean_data[name] = frame[list(chosen.values())]
    return dataframes_clean_data






In [6]:

def update_column_names(dataframes_clean_data):
    """Shorten imputer column names to the bare imputer name.

    Every column whose name contains ``"Imputer("`` is renamed to the text
    found between ``"Imputer("`` and ``")_Estim"``. Other columns keep their
    name. The rename is applied in place on each DataFrame.

    Args:
        dataframes_clean_data: dict mapping a name to a DataFrame.

    Returns:
        The same ``dataframes_clean_data`` dict that was passed in.
    """
    for name in list(dataframes_clean_data):
        frame = dataframes_clean_data[name]

        # old column name -> imputer name, for imputer columns only
        renames = {
            column: column.split("Imputer(")[1].split(")_Estim")[0]
            for column in frame.columns
            if "Imputer(" in column
        }

        frame.rename(columns=renames, inplace=True)
        dataframes_clean_data[name] = frame
    return dataframes_clean_data






In [7]:
def drop_columns_for_baseline(dataframes_clean_data):
    """Remove the KNN and RF imputer columns from the baseline DataFrame.

    Only the ``"baseline_clean"`` entry is affected; it is replaced by a copy
    without the ``"KNN-Imputer"`` and ``"RF-Imputer"`` columns.

    Args:
        dataframes_clean_data: dict that must contain a ``"baseline_clean"``
            DataFrame holding both columns.

    Returns:
        The same ``dataframes_clean_data`` dict that was passed in.
    """
    baseline_key = "baseline_clean"
    trimmed = dataframes_clean_data[baseline_key].drop(columns=["KNN-Imputer", "RF-Imputer"])
    dataframes_clean_data[baseline_key] = trimmed
    return dataframes_clean_data




In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt




def complete_statistics_plots(dataset_name,imputation_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name):
    """Write per-missing-rate statistics and "imputer value vs missing rate" plots.

    Every DataFrame is split by the missing rate encoded after the last
    underscore of its ``measure_and_missing_rates`` column. For each split,
    ``DataFrame.describe()`` is saved as a CSV, and for each feature selection
    DataFrame one PNG is saved that plots the mean of every imputer column
    against the missing rate, together with the baseline curves.

    Args:
        dataset_name: dataset label used as the first word of the plot title.
        imputation_directory: directory under which the output folder is created.
        dataframes_clean_data: dict mapping a name to a cleaned DataFrame.
            Keys containing ``"baseline_clean"`` are treated as the baseline
            (no feature selection); all other keys as feature selection results.
        missing_mechanism: label appended to the plot title.
        y_axis_label: label of the plot's y axis.
        stats_directory_name: name of the output folder created inside
            ``imputation_directory``.

    Returns:
        None. All results are written to disk.
    """
    output_directory = os.path.join(imputation_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # missing-rate label -> baseline rows, and FS key -> {missing-rate label -> rows}
    baseline_data_by_missingness = {}
    data_by_missingrates = {}
    for key, df in dataframes_clean_data.items():
        if "baseline_clean" in key:
            baseline_data_by_missingness.update(_split_by_missing_rate(df))
        else:
            data_by_missingrates[key] = _split_by_missing_rate(df)

    # Descriptive statistics, one CSV per (dataframe, missing rate).
    for key, rates_data in data_by_missingrates.items():
        for missing_rate, rate_df in rates_data.items():
            stats_path = os.path.join(output_directory, f'{key}_missing_rate_{missing_rate}_stats.csv')
            rate_df.describe().to_csv(stats_path)

    for missing_rate, rate_df in baseline_data_by_missingness.items():
        stats_path = os.path.join(output_directory, f'baseline_clean_missing_rate_{missing_rate}_stats.csv')
        rate_df.describe().to_csv(stats_path)

    # One plot per feature selection dataframe.
    for key, rates_data in data_by_missingrates.items():
        formatted_key = key.replace("_clean", "").replace("_", " ")
        title = dataset_name+" "+formatted_key.title() +" FS " +missing_mechanism
        _plot_imputer_values_vs_missing_rates(
            baseline_data_by_missingness,
            rates_data,
            graph_name=title,
            y_axis_label=y_axis_label,
            output_directory=output_directory,
        )


def _missing_rate_of(measure_string):
    """Return the text after the last underscore, e.g. ``"RMSE_0.1"`` -> ``"0.1"``."""
    return measure_string.split('_')[-1]


def _split_by_missing_rate(df):
    """Return ``{missing-rate label: rows of df with exactly that rate}``.

    Rows are matched on the extracted rate itself, not on a substring of the
    label, so ``"RMSE_0.15"`` never ends up in the ``"0.1"`` group.
    """
    rates = df['measure_and_missing_rates'].apply(_missing_rate_of)
    return {str(rate): df[rates == rate] for rate in rates.unique()}


def _sorted_by_rate(frames_by_rate):
    """Return ``[(rate as float, frame), ...]`` sorted by increasing rate."""
    pairs = [(float(label), frame) for label, frame in frames_by_rate.items()]
    return sorted(pairs, key=lambda pair: pair[0])


def _plot_imputer_values_vs_missing_rates(baseline_by_rate, fs_by_rate, graph_name, y_axis_label, output_directory):
    """Plot mean imputer values against missing rates and save the figure as PNG.

    Baseline curves are labelled ``"<imputer> (no FS)"``; feature selection
    curves are labelled with the imputer name only. Each group of curves is
    drawn over the missing rates it actually has, so a missing baseline or a
    rate absent from one group does not abort the plot.
    """
    line_styles = ['-', '--', '-.', ':']
    markers = ['o', 's', '^', 'D', 'x', '*']

    baseline_points = _sorted_by_rate(baseline_by_rate)
    fs_points = _sorted_by_rate(fs_by_rate)

    plt.figure(figsize=(10, 6))

    # Baseline data (without feature selection).
    if baseline_points:
        baseline_rates = [rate for rate, _ in baseline_points]
        # The first column is 'measure_and_missing_rates'; the rest are imputers.
        for idx, imputer in enumerate(baseline_points[0][1].columns[1:]):
            baseline_values = [frame[imputer].mean() for _, frame in baseline_points]
            plt.plot(
                baseline_rates,
                baseline_values,
                marker=markers[idx % len(markers)],
                linestyle=line_styles[(idx + len(baseline_by_rate)) % len(line_styles)],
                label=f'{imputer} (no FS)'
            )

    # Data for the feature selection method being plotted.
    if fs_points:
        fs_rates = [rate for rate, _ in fs_points]
        for idx, imputer in enumerate(fs_points[0][1].columns[1:]):
            imputer_values = [frame[imputer].mean() for _, frame in fs_points]
            plt.plot(
                fs_rates,
                imputer_values,
                marker=markers[idx % len(markers)],
                label=imputer
            )

    tick_rates = sorted({rate for rate, _ in baseline_points + fs_points})

    plt.xlabel('Missing Rates')
    plt.ylabel(y_axis_label)
    plt.title(f'{graph_name} Imputer Values vs Missing Rates')
    # The '%' format rounds, whereas int(rate*100) truncates (0.29 -> 28).
    plt.xticks(tick_rates, [f'{rate:.0%}' for rate in tick_rates])
    plt.legend()
    plt.grid(True)

    plt.savefig(os.path.join(output_directory, f'{graph_name}_Imputer_Values_vs_Missing_Rates.png'))
    plt.close()



In [9]:


# Experiment directories to process. Each entry is the missing-mechanism
# folder of one experiment; the folder name itself is used as the mechanism label.
directory_list = [
    "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#26_Cleveland_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MAR",
]



# directory_list=["/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#26_Cleveland_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#26_Cleveland_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MCAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#26_Cleveland_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MNAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#27_Diabetic_Retinopathy_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#27_Diabetic_Retinopathy_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MCAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#27_Diabetic_Retinopathy_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MNAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#28_Wisconsin_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#28_Wisconsin_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MCAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#28_Wisconsin_9_10_24_imp_pred_eval_10_trials_base_fs_pipelines/MNAR",
#                 ]


def _dataset_name_from_experiment_dir(experiment_dir_name):
    """Extract the dataset name from an experiment folder name.

    The folder name is split on underscores; the first part (the experiment
    id) is skipped and the following parts are collected up to the first
    purely numeric part. ``"Exp#26_Cleveland_9_10_24_..."`` gives
    ``"Cleveland"`` and ``"Exp#27_Diabetic_Retinopathy_9_10_24_..."`` gives
    ``"Diabetic Retinopathy"``. If no name part is found, the folder name is
    returned unchanged.
    """
    name_parts = []
    for part in experiment_dir_name.split('_')[1:]:
        if part.isdigit():
            break
        name_parts.append(part)
    return " ".join(name_parts) if name_parts else experiment_dir_name


# Run the full load -> clean -> statistics/plots pipeline once per experiment directory.
for experiment_dir in directory_list:
    # Fresh containers for every directory so results never leak between runs.
    dataframes = {}
    dataframes_no_na = {}
    dataframes_clean_data = {}

    # Measure type whose rows are removed (e.g. "MAE", "RMSE"); the remaining
    # measure is the one that gets analysed.
    measure_to_drop = "MAE"

    # Missing-mechanism experiment directory; its folder name is the mechanism label.
    imputation_directory = experiment_dir
    y_axis_label = "RMSE"
    missing_mechanism = os.path.basename(imputation_directory)

    # The same directory holds two kinds of result files:
    # imputation_eval_final_results or prediction_metrics_final_results.
    experiment_type_name = "imputation_eval_final_results"

    # Name of the directory where the stats will be saved.
    stats_directory_name = "stats_imputation_level"

    # The dataset name is encoded in the folder that contains the mechanism folder.
    parent_directory = os.path.basename(os.path.dirname(imputation_directory))
    dataset_name = _dataset_name_from_experiment_dir(parent_directory)

    dataframes = load_experiment_data(dataframes, imputation_directory, experiment_type_name)
    dataframes_no_na = drop_na(dataframes, dataframes_no_na)
    dataframes_clean_data = drop_specific_rows(dataframes_no_na, dataframes_clean_data, measure_to_drop)
    dataframes_clean_data = cleaning_for_unique_imputers(dataframes_clean_data)
    dataframes_clean_data = update_column_names(dataframes_clean_data)

    # Optional step, currently disabled:
    # dataframes_clean_data=drop_columns_for_baseline(dataframes_clean_data)
    complete_statistics_plots(dataset_name, imputation_directory, dataframes_clean_data, missing_mechanism, y_axis_label, stats_directory_name)




In [10]:
dataframes_clean_data["RFE_clean"].head(10)

Unnamed: 0,measure_and_missing_rates,KNN-Imputer,Mean-Imputer,RF-Imputer
0,RMSE_0.1,0.301408,0.338129,0.299207
4,RMSE_0.2,0.312554,0.325977,0.301294
8,RMSE_0.3,0.340422,0.341739,0.335544
12,RMSE_0.4,0.329613,0.318981,0.324104
16,RMSE_0.5,0.30778,0.306817,0.32263
20,RMSE_0.1,0.308116,0.303318,0.289671
24,RMSE_0.2,0.27985,0.293049,0.273059
28,RMSE_0.3,0.339849,0.342907,0.335443
32,RMSE_0.4,0.352865,0.353004,0.366219
36,RMSE_0.5,0.290558,0.299224,0.329875
