# # Baseline and Feature Selection for Prediction Value vs Missing Rates Analysis

In [1]:
import os
import pandas as pd

# # Initialize the dictionary to store DataFrames
# dataframes = {}

# Function to load data from the experiment directory
def load_experiment_data(dataframes,directory,experiment_type_name):
    # Traverse through all files and directories in the given directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check for baseline data in the baseline folder
            if "baseline_pipeline" in root and experiment_type_name in file and file.endswith('.csv'):
                file_path = os.path.join(root, file)
                # Load the baseline data into the 'baseline' DataFrame
                if 'baseline' not in dataframes:
                    dataframes['baseline'] = pd.read_csv(file_path)
                else:
                    # Append new data to the existing baseline DataFrame
                    data = pd.read_csv(file_path)
                    dataframes['baseline'] = pd.concat([dataframes['baseline'], data], ignore_index=True)

            # Check for data in fs_pipeline folder
            elif "fs_pipeline" in root and experiment_type_name in file and file.endswith('.csv'):
                file_path = os.path.join(root, file)

                prefix = file.split(experiment_type_name)[0].rstrip('_')  # Remove trailing '_'
                
                # Load the data into the appropriate DataFrame named by the prefix
                if prefix not in dataframes:
                    dataframes[prefix] = pd.read_csv(file_path)
                else:
                    # Append new data to the existing DataFrame
                    data = pd.read_csv(file_path)
                    dataframes[prefix] = pd.concat([dataframes[prefix], data], ignore_index=True)
    return dataframes

    





# code takes directory that contains all the data.
# Two directories, Baseline and  multiple directories for different types of feature selection





In [2]:
import re

def clean_data(dataframes,dataframes_clean_data, measures_to_keep):
    
    # Iterate through each DataFrame in the original dictionary
    for key, df in dataframes.items():
        # Rename the first column
        df = df.rename(columns={df.columns[0]: "measure_and_missing_rates"})
        
        
        # Rename the other columns according to the pattern
        new_columns = {}
        for col in df.columns[1:]:
            # Extract X and Y from "Imputer(X)_Estim(Y)"
            pattern = r"Imputer\((.*?)\)_Estim\((.*?)\)"
            match = re.match(pattern, col)
            if match:
                X = match.group(1)
                Y = match.group(2)
                new_col = f"{X}_{Y}"
                new_columns[col] = new_col
            else:
                # If the pattern doesn't match, keep the original column name
                new_columns[col] = col
        df = df.rename(columns=new_columns)


        print(df)
        print("{{{{{{{{{{{{{{{{{{{{{{{{{{{}}}}}}}}}}}}}}}}}}}}}}}}}}}")
        # Keep only rows where "measure_and_missing_rates" contains the string
        df_clean = df[df["measure_and_missing_rates"].str.contains(measures_to_keep, na=False)]
        print(measures_to_keep)
        print(df_clean)

        
        print("==============================================")
        df_clean['measure_and_missing_rates'] = df_clean['measure_and_missing_rates'].apply(lambda x: x.split('_')[-1] if isinstance(x, str) else x)


        
        # Drop all NA values from the DataFrame
        df_clean = df_clean.dropna()
        
        # # Drop the rows containing measures to drop
        # for measure in measures_to_drop:
        #     df_clean = df_clean[~df_clean["measure_and_missing_rates"].str.contains(measure, na=False)]
        
        print(df_clean)
        print("+++++++++++++++++++++++++++++++++++++++++++++")
        


        
        # Store the cleaned DataFrame
        dataframes_clean_data[f"{key}_clean"] = df_clean
        
    return dataframes_clean_data


In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt




def complete_statistics_plots(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    baseline_data_by_missingness = {}

    # Define the directory to save the stats and graphs
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)






    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        # Check if it is the baseline dataframe
        if "baseline_clean" in key:
            # Split baseline data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
        else:
            # Initialize a dictionary for each feature selection dataframe
            data_by_missingrates[key] = {}
            # Split feature selection data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]


                
    print(baseline_data_by_missingness)
    # Calculate descriptive statistics and save to CSV files
    for key, rates_data in data_by_missingrates.items():
        for missing_rate, rate_df in rates_data.items():
            # Calculate descriptive statistics
            stats = rate_df.describe()
            # Save to CSV
            stats.to_csv(os.path.join(output_directory, f'{key}_missing_rate_{missing_rate}_stats.csv'))

    for missing_rate, rate_df in baseline_data_by_missingness.items():
        # Calculate descriptive statistics for baseline
        stats = rate_df.describe()
        # Save to CSV
        stats.to_csv(os.path.join(output_directory, f'baseline_clean_missing_rate_{missing_rate}_stats.csv'))

    # Define line styles and markers for diversity
    line_styles = ['-', '--', '-.', ':']
    markers = ['o', 's', '^', 'D', 'x', '*']  # Different markers




    # Generate graphs for each feature selection dataframe
    def plot_imputer_values_vs_missing_rates(feature_selection_key, graph_name, y_axis_label):
        plt.figure(figsize=(10, 6))
        
        # Extract the missing rates and convert them to float for sorting and plotting

        missing_rates = sorted([float(rate) for rate in baseline_data_by_missingness.keys()])

        # Plot baseline data (without feature selection)
        for idx, imputer in enumerate(baseline_data_by_missingness[str(missing_rates[0])].columns[1:]):  # Skip 'measure_and_missing_rates'
            baseline_values = [baseline_data_by_missingness[str(missing_rate)][imputer].mean() for missing_rate in missing_rates]
            plt.plot(
                missing_rates,
                baseline_values,
                marker=markers[idx % len(markers)],
                # linestyle=line_styles[idx % len(line_styles)],
                linestyle=line_styles[(idx + len(baseline_data_by_missingness)) % len(line_styles)],
                label=f'{imputer} (no FS)'
            )
        
        # Plot data for the given feature selection key
        for idx, imputer in enumerate(data_by_missingrates[feature_selection_key][str(missing_rates[0])].columns[1:]):  # Skip 'measure_and_missing_rates'
            imputer_values = [data_by_missingrates[feature_selection_key][str(missing_rate)][imputer].mean() for missing_rate in missing_rates]
            plt.plot(
                missing_rates,
                imputer_values,
                marker=markers[idx % len(markers)],
                # linestyle=line_styles[(idx + len(baseline_data_by_missingness)) % len(line_styles)],
                label=imputer
            )
        
        # Set plot labels and title
        plt.xlabel('Missing Rates')
        plt.ylabel(y_axis_label)
        plt.title(f'{graph_name} Imputer Values vs Missing Rates')
        plt.xticks(missing_rates, [f'{int(rate*100)}%' for rate in missing_rates])
        plt.legend()
        plt.grid(True)
        
        # Save the plot to the output directory
        plt.savefig(os.path.join(output_directory, f'{graph_name}_Imputer_Values_vs_Missing_Rates.png'))
        plt.close()

    # Example usage of the plot function for each feature selection dataframe
    for key in data_by_missingrates.keys():
        formatted_key = key.replace("_clean", "").replace("_", " ")
        # Capitalize each word for a title
        title = formatted_key.title()

        title = dataset_name+" "+title +" FS " +missing_mechanism
        plot_imputer_values_vs_missing_rates(key, graph_name=title, y_axis_label=y_axis_label)



In [4]:
# def visualize_FS_imputation_pred(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name):
#     # Initialize dictionaries to store data split by missing rates
#     data_by_missingrates = {}
#     baseline_data_by_missingness = {}

#     # Define the directory to save the stats and graphs
#     output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
#     os.makedirs(output_directory, exist_ok=True)

#     # Iterate through each DataFrame in dataframes_clean_data
#     for key, df in dataframes_clean_data.items():
#         # Check if it is the baseline dataframe
#         if "baseline_clean" in key:
#             # Split baseline data by missing rate
#             for missing_rate in df['measure_and_missing_rates'].unique():
#                 baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
#         else:
#             # Initialize a dictionary for each feature selection dataframe
#             data_by_missingrates[key] = {}
#             # Split feature selection data by missing rate
#             for missing_rate in df['measure_and_missing_rates'].unique():
#                 data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

#     print(baseline_data_by_missingness)
#     # print(data_by_missingrates)













In [5]:
import os
import matplotlib.pyplot as plt

import os
import matplotlib.pyplot as plt

def visualize_FS_imputation_pred(dataset_name, prediction_metrics_directory, dataframes_clean_data, missing_mechanism, y_axis_label, stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    baseline_data_by_missingness = {}

    # Define the directory to save the stats and graphs
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        # Check if it is the baseline dataframe
        if "baseline_clean" in key:
            # Split baseline data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
        else:
            # Initialize a dictionary for each feature selection dataframe
            data_by_missingrates[key] = {}
            # Split feature selection data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

    # Plotting each experiment pipeline along with the baseline
    for imputation_prediction_model in baseline_data_by_missingness[list(baseline_data_by_missingness.keys())[0]].columns[1:]:  # Iterate over each column except the first
        plt.figure(figsize=(10, 6))

        # Plot the baseline
        baseline_measures = []
        missing_rates = []
        for missing_rate, baseline_df in baseline_data_by_missingness.items():
            if imputation_prediction_model in baseline_df.columns:
                baseline_measures.append(baseline_df[imputation_prediction_model].values[0])  # Assuming a single value per missing rate
                missing_rates.append(float(missing_rate) * 100)  # Convert to percentage for plotting
            else:
                print(f"Warning: '{imputation_prediction_model}' not found in baseline data for missing rate {missing_rate}.")
                continue

        plt.plot(missing_rates, baseline_measures, label=f'Baseline ({imputation_prediction_model})', marker='o')

        # Plot each feature selection pipeline
        for fs_key, fs_dict in data_by_missingrates.items():
            fs_measures = []
            for missing_rate in missing_rates:
                missing_rate_key = str(missing_rate / 100)  # Convert back to the original format
                if missing_rate_key in fs_dict and imputation_prediction_model in fs_dict[missing_rate_key].columns:
                    fs_df = fs_dict[missing_rate_key]
                    fs_measures.append(fs_df[imputation_prediction_model].values[0])  # Assuming a single value per missing rate
                else:
                    print(f"Warning: Missing data for {fs_key} at missing rate {missing_rate}.")
                    fs_measures.append(None)  # Handle missing values

            plt.plot(missing_rates, fs_measures, label=f'{fs_key} ({imputation_prediction_model})', marker='x')

        # Add labels, title, and legend
        plt.xlabel('Missing Rate (%)')
        plt.ylabel(y_axis_label)
        plt.title(f'{dataset_name} - {imputation_prediction_model} vs Missing Rate')
        plt.legend()
        plt.grid(True)
        
        # plt.ylim(y_min, y_max)


        # Save the plot
        output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_vs_Missing_Rate.png')
        plt.savefig(output_file)
        plt.close()

    print(f'Plots saved in {output_directory}')









In [6]:
import os
import pandas as pd
from scipy import stats

def generate_combined_csv_per_graph(dataset_name, prediction_metrics_directory, dataframes_clean_data, stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    baseline_data_by_missingness = {}

    # Define the directory to save the stats and CSV files
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        # Check if it is the baseline dataframe
        if "baseline_clean" in key:
            # Split baseline data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
        else:
            # Initialize a dictionary for each feature selection dataframe
            data_by_missingrates[key] = {}
            # Split feature selection data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

    # Iterate over each imputation+prediction model combination to create combined CSVs
    for imputation_prediction_model in baseline_data_by_missingness[list(baseline_data_by_missingness.keys())[0]].columns[1:]:
        # Prepare a DataFrame to store performance values for each missing rate
        values_data = {'Missing Rates': ['10%', '20%', '30%', '40%', '50%']}
        
        # Extract and calculate values for baseline pipeline
        baseline_values = []
        for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
            if missing_rate in baseline_data_by_missingness:
                baseline_value = baseline_data_by_missingness[missing_rate][imputation_prediction_model].mean()
                values_data['Baseline (' + imputation_prediction_model + ')'] = [baseline_value] if missing_rate == '0.1' else values_data['Baseline (' + imputation_prediction_model + ')'] + [baseline_value]
                baseline_values.append(baseline_value)

        # Extract and calculate values for each feature selection pipeline
        fs_values_dict = {}
        for fs_key, fs_dict in data_by_missingrates.items():
            fs_values = []
            for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
                if missing_rate in fs_dict and imputation_prediction_model in fs_dict[missing_rate].columns:
                    fs_value = fs_dict[missing_rate][imputation_prediction_model].mean()
                    if fs_key not in values_data:
                        values_data[fs_key] = [fs_value]
                    else:
                        values_data[fs_key].append(fs_value)
                    fs_values.append(fs_value)
                else:
                    fs_values.append(None)
            fs_values_dict[fs_key] = fs_values

        # Convert values_data to DataFrame
        values_df = pd.DataFrame(values_data)

        # Prepare a DataFrame for descriptive statistics
        stats_data = {'Missing Rates': ['Mean', 'Std', 'P-Value']}
        
        # Calculate descriptive stats for the baseline
        baseline_mean = pd.Series(baseline_values).mean()
        baseline_std = pd.Series(baseline_values).std()
        
        stats_data['Baseline (' + imputation_prediction_model + ')'] = [baseline_mean, baseline_std, '']  # P-Value not needed for baseline

        # Calculate descriptive stats and p-values for each feature selection pipeline
        for fs_key, fs_values in fs_values_dict.items():
            fs_mean = pd.Series(fs_values).mean()
            fs_std = pd.Series(fs_values).std()
            t_stat, p_value = stats.ttest_ind(baseline_values, [v for v in fs_values if v is not None], equal_var=False)

            # Store stats in the stats_data dictionary
            stats_data[fs_key] = [fs_mean, fs_std, p_value]

        # Convert stats_data to DataFrame
        stats_df = pd.DataFrame(stats_data)

        # Combine both DataFrames into one with an empty row in between
        combined_df = pd.concat([values_df, pd.DataFrame([[''] * len(values_df.columns)], columns=values_df.columns), stats_df], ignore_index=True)

        # Save combined DataFrame to CSV
        output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_combined_stats.csv')
        combined_df.to_csv(output_file, index=False)
        print(f'Statistics and comparison results saved in {output_file}')



In [7]:
# stats_directory_name="combined_stats_with_pvalues"+stats_directory_name

In [8]:
# import os
# import pandas as pd
# from scipy import stats
# from itertools import combinations

# def generate_combined_csv_per_graph_with_pvalues_all_vs_all(dataset_name, prediction_metrics_directory, dataframes_clean_data, stats_directory_name):
#     # Initialize dictionaries to store data split by missing rates
#     data_by_missingrates = {}
    
#     # Define the directory to save the stats and CSV files
#     stats_directory_name="combined_stats_with_pvalues"+stats_directory_name
#     output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
#     os.makedirs(output_directory, exist_ok=True)

#     # Iterate through each DataFrame in dataframes_clean_data
#     for key, df in dataframes_clean_data.items():
#         # Initialize a dictionary for each pipeline (baseline or feature selection)
#         data_by_missingrates[key] = {}
#         # Split data by missing rate
#         for missing_rate in df['measure_and_missing_rates'].unique():
#             data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

#     # Iterate over each imputation+prediction model combination to create combined CSVs
#     for imputation_prediction_model in data_by_missingrates[list(data_by_missingrates.keys())[0]]['0.1'].columns[1:]:
#         # Prepare a DataFrame to store performance values for each missing rate
#         values_data = {'Missing Rates': ['10%', '20%', '30%', '40%', '50%']}
        
#         # Extract and calculate values for each pipeline
#         pipeline_values_dict = {}
#         for pipeline_key, pipeline_dict in data_by_missingrates.items():
#             pipeline_values = []
#             for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
#                 if missing_rate in pipeline_dict and imputation_prediction_model in pipeline_dict[missing_rate].columns:
#                     pipeline_value = pipeline_dict[missing_rate][imputation_prediction_model].mean()
#                     if pipeline_key not in values_data:
#                         values_data[pipeline_key] = [pipeline_value]
#                     else:
#                         values_data[pipeline_key].append(pipeline_value)
#                     pipeline_values.append(pipeline_value)
#                 else:
#                     pipeline_values.append(None)
#             pipeline_values_dict[pipeline_key] = pipeline_values

#         # Convert values_data to DataFrame
#         values_df = pd.DataFrame(values_data)

#         # Prepare a DataFrame for descriptive statistics and pairwise p-values
#         stats_data = {'Missing Rates': ['Mean', 'Std']}

#         # Calculate means and standard deviations for each pipeline
#         for pipeline_key, pipeline_values in pipeline_values_dict.items():
#             pipeline_mean = pd.Series(pipeline_values).mean()
#             pipeline_std = pd.Series(pipeline_values).std()
#             stats_data[pipeline_key] = [pipeline_mean, pipeline_std]

#         # Now calculate p-values for every pairwise comparison between pipelines
#         pairwise_p_values = {}
#         for (pipeline1, values1), (pipeline2, values2) in combinations(pipeline_values_dict.items(), 2):
#             t_stat, p_value = stats.ttest_ind([v for v in values1 if v is not None],
#                                               [v for v in values2 if v is not None],
#                                               equal_var=False)
#             # Bold p-values less than 0.05
#             p_value_str = f"**{p_value:.4f}**" if p_value < 0.05 else f"{p_value:.4f}"
#             pairwise_p_values[f'P-Value ({pipeline1} vs {pipeline2})'] = p_value_str

#         # Add the p-values to the stats_data dictionary
#         stats_data.update(pairwise_p_values)

#         # Convert stats_data to DataFrame
#         stats_df = pd.DataFrame(stats_data)

#         # Combine both DataFrames into one with an empty row in between
#         combined_df = pd.concat([values_df, pd.DataFrame([[''] * len(values_df.columns)], columns=values_df.columns), stats_df], ignore_index=True)

#         # Save combined DataFrame to CSV
#         output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_combined_stats.csv')
#         combined_df.to_csv(output_file, index=False)
#         print(f'Statistics and comparison results saved in {output_file}')


In [9]:
import os
import pandas as pd
from scipy import stats
from itertools import combinations

def generate_combined_csv_per_graph_with_pvalues_all_vs_all(dataset_name, prediction_metrics_directory, dataframes_clean_data, stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    
    # Define the directory to save the stats and CSV files
    stats_directory_name = "combined_stats_with_pvalues" + stats_directory_name
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        data_by_missingrates[key] = {}
        # Split data by missing rate
        for missing_rate in df['measure_and_missing_rates'].unique():
            data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

    # Iterate over each imputation+prediction model combination to create combined CSVs
    for imputation_prediction_model in data_by_missingrates[list(data_by_missingrates.keys())[0]]['0.1'].columns[1:]:
        # Prepare a DataFrame to store performance values for each missing rate
        values_data = {'Missing Rates': ['10%', '20%', '30%', '40%', '50%']}
        
        # Extract and calculate values for each pipeline
        pipeline_values_dict = {}
        for pipeline_key, pipeline_dict in data_by_missingrates.items():
            pipeline_values = []
            for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
                if missing_rate in pipeline_dict and imputation_prediction_model in pipeline_dict[missing_rate].columns:
                    pipeline_value = pipeline_dict[missing_rate][imputation_prediction_model].mean()
                    if pipeline_key not in values_data:
                        values_data[pipeline_key] = [pipeline_value]
                    else:
                        values_data[pipeline_key].append(pipeline_value)
                    pipeline_values.append(pipeline_value)
                else:
                    pipeline_values.append(None)
            pipeline_values_dict[pipeline_key] = pipeline_values

        # Convert values_data to DataFrame for the actual performance values
        values_df = pd.DataFrame(values_data)

        # Now prepare the p-values for pairwise comparisons in a separate table
        p_values_data = {'Missing Rates': ['10%', '20%', '30%', '40%', '50%']}
        for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
            for (col1, values1), (col2, values2) in combinations(pipeline_values_dict.items(), 2):
                t_stat, p_value = stats.ttest_ind([v for v in values1 if v is not None],
                                                  [v for v in values2 if v is not None],
                                                  equal_var=False)
                p_value_str = f"**{p_value:.4f}**" if p_value < 0.05 else f"{p_value:.4f}"
                pairwise_key = f'P-Value ({col1} vs {col2})'
                
                # For each missing rate, append the corresponding p-value
                if pairwise_key not in p_values_data:
                    p_values_data[pairwise_key] = [p_value_str]
                else:
                    p_values_data[pairwise_key].append(p_value_str)

        # Convert p_values_data to DataFrame for the p-values
        p_values_df = pd.DataFrame(p_values_data)

        # Combine the actual values and the p-values into one DataFrame, separated by an empty row
        combined_df = pd.concat([values_df, pd.DataFrame([[''] * len(values_df.columns)], columns=values_df.columns), p_values_df], ignore_index=True)

        # Save combined DataFrame to CSV
        output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_combined_stats.csv')
        combined_df.to_csv(output_file, index=False)
        print(f'Statistics and comparison results saved in {output_file}')


In [11]:


directory_list=["/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MAR"]
# measures=["roc_auc","accuracy","f1_score"]
measures=["accuracy"]


# directory_list=["/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MCAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MNAR"

#                 ]


for dir in directory_list:
    for measure in measures:
        # Initialize the dictionary to store DataFrames
        dataframes = {}

        # Define the measure type you want to drop (e.g., "MAE", "RMSE")
        #options: "auc_roc","accuracy","f1_score"
        # measure_to_drop = ["auc_roc","accuracy"]  # Change this value to drop a different measure type
        # Initialize a new dictionary to store DataFrames with specific rows removed
        dataframes_clean_data = {}





        # MISSING MECHANISM Experiment Directory
        prediction_metrics_directory = dir
        print(measure)
        y_axis_label=measure
        missing_mechanism=os.path.basename(prediction_metrics_directory)

        #Since we are using the same directory per dataset, we want to look at different files, the two files are shown below
        #imputation_eval_final_results or prediction_metrics_final_results
        experiment_type_name="prediction_metrics_final_results"
        #name of the directory where the stats will be saved
        generic_stats_directory_name="stats_prediction_level"
        complete_stats_directory_name=f"{generic_stats_directory_name}/{measure}_complete_{generic_stats_directory_name}"
        stats_directory_name=f"{generic_stats_directory_name}/{measure}_{generic_stats_directory_name}"

        # Extract the directory name before MAR or MCAR or MNAR
        parent_directory = os.path.basename(os.path.dirname(prediction_metrics_directory))
        dataset_name = parent_directory.split('_')[1]  # Assuming "Cleveland" is always the second part







        dataframes=load_experiment_data(dataframes,prediction_metrics_directory,experiment_type_name)
        dataframes_clean_data=clean_data(dataframes, dataframes_clean_data, y_axis_label)
        # print(dataframes_clean_data)


        complete_statistics_plots(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,complete_stats_directory_name)

        visualize_FS_imputation_pred(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name)
        generate_combined_csv_per_graph(
        dataset_name,
        prediction_metrics_directory,
        dataframes_clean_data,
        stats_directory_name)

        generate_combined_csv_per_graph_with_pvalues_all_vs_all(dataset_name, prediction_metrics_directory, dataframes_clean_data, stats_directory_name)
        
        

        
        
        


        # This is an alternative to specifically set the y_min and y_max values
        # visualize_FS_imputation_pred(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name, 0.6,1)
        # Uncomment this in the function
        # plt.ylim(y_min, y_max)

     





accuracy
   measure_and_missing_rates  KNN-Imputer_SV-Classifier  \
0                roc_auc_0.1                   0.887840   
1               accuracy_0.1                   0.801960   
2               f1_score_0.1                   0.780308   
3                roc_auc_0.2                   0.881860   
4               accuracy_0.2                   0.818520   
5               f1_score_0.2                   0.798712   
6                roc_auc_0.3                   0.864990   
7               accuracy_0.3                   0.797940   
8               f1_score_0.3                   0.773127   
9                roc_auc_0.4                   0.835890   
10              accuracy_0.4                   0.767690   
11              f1_score_0.4                   0.729971   
12               roc_auc_0.5                   0.847970   
13              accuracy_0.5                   0.771620   
14              f1_score_0.5                   0.745465   
15               roc_auc_0.1                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['measure_and_missing_rates'] = df_clean['measure_and_missing_rates'].apply(lambda x: x.split('_')[-1] if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['measure_and_missing_rates'] = df_clean['measure_and_missing_rates'].apply(lambda x: x.split('_')[-1] if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

Plots saved in /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MAR/stats_prediction_level/accuracy_stats_prediction_level
Statistics and comparison results saved in /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MAR/stats_prediction_level/accuracy_stats_prediction_level/prelimdata_KNN-Imputer_SV-Classifier_combined_stats.csv
Statistics and comparison results saved in /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature S

In [14]:
dataframes_clean_data["chi_square_clean"].head(100) 

Unnamed: 0,measure_and_missing_rates,KNN-Imputer_SV-Classifier,KNN-Imputer_RF-Classifier,KNN-Imputer_XGB-Classifier,Mean-Imputer_SV-Classifier,Mean-Imputer_RF-Classifier,Mean-Imputer_XGB-Classifier,RF-Imputer_SV-Classifier,RF-Imputer_RF-Classifier,RF-Imputer_XGB-Classifier
1,0.1,0.81196,0.8153,0.80874,0.81541,0.80864,0.79876,0.80852,0.82898,0.79185
4,0.2,0.80495,0.79148,0.76127,0.80195,0.82242,0.7816,0.79839,0.79507,0.79552
7,0.3,0.8247,0.80184,0.78149,0.81161,0.81518,0.78161,0.80758,0.80472,0.80496
10,0.4,0.80471,0.77781,0.72368,0.80459,0.77437,0.73414,0.79114,0.78438,0.76805
13,0.5,0.80563,0.82542,0.80126,0.79886,0.82518,0.80126,0.80805,0.78804,0.79472
16,0.1,0.82897,0.80852,0.77185,0.81553,0.81551,0.76853,0.81529,0.82596,0.80897
19,0.2,0.81554,0.82229,0.76161,0.81207,0.78852,0.77173,0.79863,0.79563,0.78542
22,0.3,0.75782,0.77471,0.71103,0.78126,0.77817,0.77517,0.78115,0.75816,0.74759
25,0.4,0.80185,0.76851,0.7716,0.81196,0.79541,0.7785,0.80518,0.7953,0.77827
28,0.5,0.76139,0.79126,0.75736,0.78484,0.77081,0.74689,0.77449,0.76127,0.71689
