## Per Fold Baseline and Feature Selection for Prediction Value vs Missing Rates Analysis

In [1]:
import os
import pandas as pd

# # Initialize the dictionary to store DataFrames
# dataframes = {}

# Function to load data from the experiment directory
def load_experiment_data(dataframes,directory,experiment_type_name):
    # Traverse through all files and directories in the given directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check for baseline data in the baseline folder
            if "baseline_pipeline" in root and experiment_type_name in file and file.endswith('.csv'):
                file_path = os.path.join(root, file)
                print(file_path)
                # Load the baseline data into the 'baseline' DataFrame
                if 'baseline' not in dataframes:
                    dataframes['baseline'] = pd.read_csv(file_path)
                else:
                    # Append new data to the existing baseline DataFrame
                    data = pd.read_csv(file_path)
                    
                    dataframes['baseline'] = pd.concat([dataframes['baseline'], data], ignore_index=True)

            # Check for data in fs_pipeline folder
            elif "fs_pipeline" in root and experiment_type_name in file and file.endswith('.csv'):
                file_path = os.path.join(root, file)

                prefix = file.split(experiment_type_name)[0].rstrip('_')  # Remove trailing '_'
                
                # Load the data into the appropriate DataFrame named by the prefix
                if prefix not in dataframes:
                    dataframes[prefix] = pd.read_csv(file_path)
                else:
                    # Append new data to the existing DataFrame
                    data = pd.read_csv(file_path)
                    dataframes[prefix] = pd.concat([dataframes[prefix], data], ignore_index=True)
    return dataframes

    





# code takes directory that contains all the data.
# Two directories, Baseline and  multiple directories for different types of feature selection





In [2]:
import re

def clean_data(dataframes,dataframes_clean_data, measures_to_keep):
    
    # Iterate through each DataFrame in the original dictionary
    for key, df in dataframes.items():
        # Rename the first column
        df = df.rename(columns={df.columns[0]: "measure_and_missing_rates"})
        
        
        # Rename the other columns according to the pattern
        new_columns = {}
        for col in df.columns[1:]:
            # Extract X and Y from "Imputer(X)_Estim(Y)"
            pattern = r"Imputer\((.*?)\)_Estim\((.*?)\)"
            match = re.match(pattern, col)
            if match:
                X = match.group(1)
                Y = match.group(2)
                new_col = f"{X}_{Y}"
                new_columns[col] = new_col
            else:
                # If the pattern doesn't match, keep the original column name
                new_columns[col] = col
        df = df.rename(columns=new_columns)

        # Keep only rows where "measure_and_missing_rates" contains the string
        df_clean = df[df["measure_and_missing_rates"].str.contains(measures_to_keep, na=False)]


        
        print("==============================================")
        df_clean['measure_and_missing_rates'] = df_clean['measure_and_missing_rates'].apply(lambda x: x.split('_')[-1] if isinstance(x, str) else x)


        
        # Drop all NA values from the DataFrame
        df_clean = df_clean.dropna()
        
        # # Drop the rows containing measures to drop
        # for measure in measures_to_drop:
        #     df_clean = df_clean[~df_clean["measure_and_missing_rates"].str.contains(measure, na=False)]
        

        


        
        # Store the cleaned DataFrame
        dataframes_clean_data[f"{key}"] = df_clean
    
    for key, df_clean in dataframes_clean_data.items():
        # Sort each dataframe by the "measure_and_missing_rates" column
        dataframes_clean_data[f"{key}"] = df_clean.sort_values(by="measure_and_missing_rates", ascending=True)


        
        
    return dataframes_clean_data


In [3]:
import os
import matplotlib.pyplot as plt

def visualize_FS_imputation_pred(dataset_name, prediction_metrics_directory, dataframes_clean_data, missing_mechanism, y_axis_label, stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    baseline_data_by_missingness = {}


    # Initialize dictionaries to store mean values
    baseline_means = {}
    fs_means = {}

    # Define the directory to save the stats and graphs
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        # Check if it is the baseline dataframe
        if "baseline" in key:
            # Split baseline data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
                # Calculate the mean for each column (ignoring the first column which is the missing rate)
                if str(missing_rate) not in baseline_means:
                    baseline_means[str(missing_rate)] = {}
                baseline_means[str(missing_rate)] = baseline_data_by_missingness[str(missing_rate)].iloc[:, 1:].mean()
        else:
            # Initialize a dictionary for each feature selection dataframe
            data_by_missingrates[key] = {}
            if key not in fs_means:
                fs_means[key] = {}
            # Split feature selection data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
                # Calculate the mean for each feature selection method and missing rate
                fs_means[key][str(missing_rate)] = data_by_missingrates[key][str(missing_rate)].iloc[:, 1:].mean()
                

    # Plotting each experiment pipeline along with the baseline
    for imputation_prediction_model in baseline_data_by_missingness[list(baseline_data_by_missingness.keys())[0]].columns[1:]:  # Iterate over each column except the first
        plt.figure(figsize=(10, 6))

        # Plot the baseline
        baseline_measures = []
        missing_rates = []
        for missing_rate, baseline_df in baseline_data_by_missingness.items():
            if imputation_prediction_model in baseline_df.columns:
                # baseline_measures.append(baseline_df[imputation_prediction_model].values[0])  # Assuming a single value per missing rate
                baseline_measures.append(baseline_df[imputation_prediction_model].mean())  # Calculate the mean for the baseline


                missing_rates.append(float(missing_rate) * 100)  # Convert to percentage for plotting
            else:
                print(f"Warning: '{imputation_prediction_model}' not found in baseline data for missing rate {missing_rate}.")
                continue

        plt.plot(missing_rates, baseline_measures, label=f'Baseline ({imputation_prediction_model})', marker='o')

        # Plot each feature selection pipeline
        for fs_key, fs_dict in data_by_missingrates.items():
            fs_measures = []
            for missing_rate in missing_rates:
                missing_rate_key = str(missing_rate / 100)  # Convert back to the original format
                if missing_rate_key in fs_dict and imputation_prediction_model in fs_dict[missing_rate_key].columns:
                    fs_df = fs_dict[missing_rate_key]
                    # fs_measures.append(fs_df[imputation_prediction_model].values[0])  # Assuming a single value per missing rate
                    fs_measures.append(fs_df[imputation_prediction_model].mean())  # Calculate the mean for feature selection


                else:
                    print(f"Warning: Missing data for {fs_key} at missing rate {missing_rate}.")
                    fs_measures.append(None)  # Handle missing values

            plt.plot(missing_rates, fs_measures, label=f'{fs_key} ({imputation_prediction_model})', marker='x')

        # Add labels, title, and legend
        plt.xlabel('Missing Rate (%)')
        plt.ylabel(y_axis_label)
        plt.title(f'{dataset_name} - {imputation_prediction_model} vs Missing Rate')
        plt.legend()
        plt.grid(True)
        
        # plt.ylim(y_min, y_max)


        # Save the plot
        output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_vs_Missing_Rate.png')
        plt.savefig(output_file)
        plt.close()

    print(f'Plots saved in {output_directory}')
    return data_by_missingrates, baseline_data_by_missingness
    












In [4]:
import os
import pandas as pd
from itertools import combinations
from scipy.stats import ttest_rel

def compare_feature_selection_across_pipelines_with_baseline(data_by_missingrates,prediction_metrics_directory,baseline_data_by_missingness, stats_directory_name):

    # Define the directory to save the stats and graphs
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # List to store p-values for writing to a CSV later
    p_values_list = []

    # Get the list of classifiers (pipeline columns) from one of the dataframes
    any_key = next(iter(data_by_missingrates))
    any_missing_rate = next(iter(data_by_missingrates[any_key]))
    classifiers = data_by_missingrates[any_key][any_missing_rate].columns[1:]  # Skip the first column (measure_and_missing_rates)

    # Iterate through each classifier pipeline (e.g., KNN-Imputer_SV-Classifier)
    for classifier in classifiers:
        # Iterate through each missing rate (0.1, 0.2, 0.3, etc.)
        for missing_rate in data_by_missingrates[any_key].keys():
            # Dictionary to hold the data for all feature selection methods for the current classifier and missing rate
            feature_selection_data = {}
            
            # Gather the values for this classifier across all feature selection methods for this missing rate
            for feature_selection_method, missing_rate_dict in data_by_missingrates.items():
                feature_selection_data[feature_selection_method] = missing_rate_dict[missing_rate][classifier].values

            # Add baseline data for this missing rate
            if missing_rate in baseline_data_by_missingness:
                baseline_values = baseline_data_by_missingness[missing_rate][classifier].values
                feature_selection_data['baseline'] = baseline_values

            # Get all pairs of feature selection methods (including baseline) for comparison
            feature_selection_pairs = list(combinations(feature_selection_data.keys(), 2))

            # Perform pairwise comparisons for each pair of feature selection methods
            for method1, method2 in feature_selection_pairs:
                # Get the performance values for both feature selection methods
                values_method1 = feature_selection_data[method1]
                values_method2 = feature_selection_data[method2]

                # Perform the paired t-test
                stat, p_value = ttest_rel(values_method1, values_method2)

                # Append the results as a row to the list
                p_values_list.append({
                    'Classifier Pipeline': classifier,
                    'Missing Rate': missing_rate,
                    'Feature Selection Method 1': method1,
                    'Feature Selection Method 2': method2,
                    'p-value': p_value
                })

    # Convert the list of p-values to a DataFrame
    p_values_df = pd.DataFrame(p_values_list)

    # Define the output file path
    output_file = os.path.join(output_directory, 'feature_selection_comparisons_with_baseline_p_values.csv')

    # Save the DataFrame as a CSV file
    p_values_df.to_csv(output_file, index=False)

    print(f"P-values (including baseline) saved to {output_file}")


In [5]:
import os
import pandas as pd
from scipy import stats

def generate_combined_csv_per_graph(dataset_name, prediction_metrics_directory, dataframes_clean_data, stats_directory_name):
    # Initialize dictionaries to store data split by missing rates
    data_by_missingrates = {}
    baseline_data_by_missingness = {}

    # Define the directory to save the stats and CSV files
    output_directory = os.path.join(prediction_metrics_directory, stats_directory_name)
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each DataFrame in dataframes_clean_data
    for key, df in dataframes_clean_data.items():
        # Check if it is the baseline dataframe
        if "baseline" in key:
            # Split baseline data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                baseline_data_by_missingness[str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]
        else:
            # Initialize a dictionary for each feature selection dataframe
            data_by_missingrates[key] = {}
            # Split feature selection data by missing rate
            for missing_rate in df['measure_and_missing_rates'].unique():
                data_by_missingrates[key][str(missing_rate)] = df[df['measure_and_missing_rates'] == str(missing_rate)]

    # Iterate over each imputation+prediction model combination to create combined CSVs
    for imputation_prediction_model in baseline_data_by_missingness[list(baseline_data_by_missingness.keys())[0]].columns[1:]:
        # Prepare a DataFrame to store performance values for each missing rate
        values_data = {'Missing Rates': ['10%', '20%', '30%', '40%', '50%']}
        
        # Extract and calculate values for baseline pipeline
        baseline_values = []
        for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
            if missing_rate in baseline_data_by_missingness:
                baseline_value = baseline_data_by_missingness[missing_rate][imputation_prediction_model].mean()
                values_data['Baseline (' + imputation_prediction_model + ')'] = [baseline_value] if missing_rate == '0.1' else values_data['Baseline (' + imputation_prediction_model + ')'] + [baseline_value]
                baseline_values.append(baseline_value)

        # Extract and calculate values for each feature selection pipeline
        fs_values_dict = {}
        for fs_key, fs_dict in data_by_missingrates.items():
            fs_values = []
            for missing_rate in ['0.1', '0.2', '0.3', '0.4', '0.5']:
                if missing_rate in fs_dict and imputation_prediction_model in fs_dict[missing_rate].columns:
                    fs_value = fs_dict[missing_rate][imputation_prediction_model].mean()
                    if fs_key not in values_data:
                        values_data[fs_key] = [fs_value]
                    else:
                        values_data[fs_key].append(fs_value)
                    fs_values.append(fs_value)
                else:
                    fs_values.append(None)
            fs_values_dict[fs_key] = fs_values

        # Convert values_data to DataFrame
        values_df = pd.DataFrame(values_data)

        # Prepare a DataFrame for descriptive statistics
        stats_data = {'Missing Rates': ['Mean', 'Std', 'P-Value']}
        
        # Calculate descriptive stats for the baseline
        baseline_mean = pd.Series(baseline_values).mean()
        baseline_std = pd.Series(baseline_values).std()
        
        stats_data['Baseline (' + imputation_prediction_model + ')'] = [baseline_mean, baseline_std, '']  # P-Value not needed for baseline

        # Calculate descriptive stats and p-values for each feature selection pipeline
        for fs_key, fs_values in fs_values_dict.items():
            fs_mean = pd.Series(fs_values).mean()
            fs_std = pd.Series(fs_values).std()
            # Perform a paired t-test between the baseline and the feature selection pipeline
            t_stat, p_value = stats.ttest_rel(baseline_values, [v for v in fs_values if v is not None])

            # Store stats in the stats_data dictionary
            stats_data[fs_key] = [fs_mean, fs_std, p_value]

        # Convert stats_data to DataFrame
        stats_df = pd.DataFrame(stats_data)

        # Combine both DataFrames into one with an empty row in between
        combined_df = pd.concat([values_df, pd.DataFrame([[''] * len(values_df.columns)], columns=values_df.columns), stats_df], ignore_index=True)

        # Save combined DataFrame to CSV
        output_file = os.path.join(output_directory, f'{dataset_name}_{imputation_prediction_model}_combined_stats.csv')
        combined_df.to_csv(output_file, index=False)
        print(f'Statistics and comparison results saved in {output_file}')



In [6]:


directory_list=["/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#33_MAR_Test_Cleveland_10_2_24_imp_pred_eval_10_trials_10_fold_base_fs_pipelines/MAR"]
# measures=["roc_auc","accuracy","f1_score"]
measures=["accuracy"]


# directory_list=["/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MCAR",
#                 "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#29_prelimdata_Cleveland_9_22_24_imp_pred_eval_3_trials_base_fs_pipelines/MNAR"

#                 ]


for dir in directory_list:
    for measure in measures:
        # Initialize the dictionary to store DataFrames
        dataframes = {}

        # Define the measure type you want to drop (e.g., "MAE", "RMSE")
        #options: "auc_roc","accuracy","f1_score"
        # measure_to_drop = ["auc_roc","accuracy"]  # Change this value to drop a different measure type
        # Initialize a new dictionary to store DataFrames with specific rows removed
        dataframes_clean_data = {}





        # MISSING MECHANISM Experiment Directory
        prediction_metrics_directory = dir
       
        y_axis_label=measure
        missing_mechanism=os.path.basename(prediction_metrics_directory)

        #Since we are using the same directory per dataset, we want to look at different files, the two files are shown below
        #imputation_eval_final_results or prediction_metrics_final_results
        experiment_type_name="per_fold_missingness"
        #name of the directory where the stats will be saved
        generic_stats_directory_name="stats_prediction_level"
        complete_stats_directory_name=f"{generic_stats_directory_name}/{measure}_complete_{generic_stats_directory_name}"
        stats_directory_name=f"{generic_stats_directory_name}/{measure}_{generic_stats_directory_name}"

        # Extract the directory name before MAR or MCAR or MNAR
        parent_directory = os.path.basename(os.path.dirname(prediction_metrics_directory))
        dataset_name = parent_directory.split('_')[1]  # Assuming "Cleveland" is always the second part


        dataframes=load_experiment_data(dataframes,prediction_metrics_directory,experiment_type_name)
        dataframes_clean_data=clean_data(dataframes, dataframes_clean_data, y_axis_label)

        data_by_missingrates, baseline_data_by_missingness=visualize_FS_imputation_pred(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name)
        compare_feature_selection_across_pipelines_with_baseline(data_by_missingrates,prediction_metrics_directory, baseline_data_by_missingness, stats_directory_name)
        
        generate_combined_csv_per_graph(
        dataset_name,
        prediction_metrics_directory,
        dataframes_clean_data,
        stats_directory_name)


        
        

        
        
        





        # This is an alternative to specifically set the y_min and y_max values
        # visualize_FS_imputation_pred(dataset_name,prediction_metrics_directory, dataframes_clean_data,missing_mechanism,y_axis_label,stats_directory_name, 0.6,1)
        # Uncomment this in the function
        # plt.ylim(y_min, y_max)

/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#33_MAR_Test_Cleveland_10_2_24_imp_pred_eval_10_trials_10_fold_base_fs_pipelines/MAR/MAR_Experiment_2024-10-02_18-22-13.390822/baseline_pipeline/MAR/per_fold_results/baseline_per_fold_missingness_0.5/baseline_per_fold_missingness_fold_9_metrics.csv
/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#33_MAR_Test_Cleveland_10_2_24_imp_pred_eval_10_trials_10_fold_base_fs_pipelines/MAR/MAR_Experiment_2024-10-02_18-22-13.390822/baseline_pipeline/MAR/per_fold_results/baseline_per_fold_missingness_0.5/baseline_per_fold_missingness_fold_5_metrics.csv
/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/Dyla

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['measure_and_missing_rates'] = df_clean['measure_and_missing_rates'].apply(lambda x: x.split('_')[-1] if isinstance(x, str) else x)


Plots saved in /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#33_MAR_Test_Cleveland_10_2_24_imp_pred_eval_10_trials_10_fold_base_fs_pipelines/MAR/stats_prediction_level/accuracy_stats_prediction_level
P-values (including baseline) saved to /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#33_MAR_Test_Cleveland_10_2_24_imp_pred_eval_10_trials_10_fold_base_fs_pipelines/MAR/stats_prediction_level/accuracy_stats_prediction_level/feature_selection_comparisons_with_baseline_p_values.csv
Statistics and comparison results saved in /Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 

In [7]:
baseline_data_by_missingness["0.1"]

Unnamed: 0,measure_and_missing_rates,KNN-Imputer_SV-Classifier,KNN-Imputer_RF-Classifier,KNN-Imputer_XGB-Classifier,Mean-Imputer_SV-Classifier,Mean-Imputer_RF-Classifier,Mean-Imputer_XGB-Classifier,RF-Imputer_SV-Classifier,RF-Imputer_RF-Classifier,RF-Imputer_XGB-Classifier
748,0.1,0.8000,0.9000,0.8000,0.8333,0.7667,0.7333,0.8000,0.8000,0.8333
586,0.1,0.9310,0.8276,0.8276,0.8276,0.7931,0.8621,0.8966,0.7241,0.7931
589,0.1,0.7000,0.7000,0.6333,0.7000,0.7333,0.6667,0.7000,0.7333,0.7333
592,0.1,0.7333,0.7667,0.7667,0.7000,0.7333,0.7667,0.7333,0.8000,0.7667
595,0.1,0.9310,0.9310,0.8276,0.8966,0.9310,0.8966,0.9310,0.9310,0.8966
...,...,...,...,...,...,...,...,...,...,...
139,0.1,0.7000,0.7333,0.7333,0.7667,0.7000,0.7333,0.7333,0.7333,0.7000
142,0.1,0.8000,0.8000,0.7667,0.8000,0.8000,0.7333,0.8000,0.8333,0.7667
145,0.1,0.8621,0.8621,0.8276,0.8621,0.9310,0.8966,0.8966,0.8966,0.8966
121,0.1,0.8966,0.7931,0.8621,0.8966,0.8621,0.7586,0.8621,0.8966,0.8621


In [8]:
# import os
# import re

# def rename_files_in_directory(directory):
#     # Regular expression to match the file pattern fold_N_per_fold_metrics_.csv
#     pattern = r"fold_(\d+)_per_fold_metrics_\.csv"
    
#     # os.walk will walk through all subdirectories as well
#     for root, dirs, files in os.walk(directory):
#         for filename in files:
#             # Check if the filename matches the required pattern
#             match = re.match(pattern, filename)
#             if match:
#                 # Extract the number N from the filename
#                 N = match.group(1)
                
#                 # Construct the new filename
#                 new_filename = f"baseline_per_fold_missingness_fold_{N}_metrics.csv"
                
#                 # Get the full path for the current and new file names
#                 old_filepath = os.path.join(root, filename)
#                 new_filepath = os.path.join(root, new_filename)
                
#                 # Rename the file
#                 os.rename(old_filepath, new_filepath)
#                 print(f"Renamed: {old_filepath} -> {new_filepath}")




# # Provide the directory where the files are located
# directory = "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#32_prelimdata_Cleveland_10_1_24_imp_pred_eval_2_trials_10_fold_base_fs_pipelines/MAR/MAR_Experiment_2024-09-30_01-50-28.442560/baseline_pipeline/MAR/per_fold_results"
# rename_files_in_directory(directory)


In [9]:
# import os
# import re

# def rename_files_in_folders(directory):
#     # Regular expression to match the file pattern fold_N_per_fold_metrics_.csv
#     file_pattern = r"fold_(\d+)_per_fold_metrics_\.csv"
    
#     # Walk through the main directory and all its subdirectories
#     for root, dirs, files in os.walk(directory):
#         for folder_name in dirs:
#             # Extract the feature selection method from the folder name (before 'per_fold_missingness')
#             feature_selection = folder_name.split('_per_fold_missingness')[0]
#             folder_path = os.path.join(root, folder_name)
            
#             # Now we walk through each folder to find files matching the pattern
#             for sub_root, _, sub_files in os.walk(folder_path):
#                 for filename in sub_files:
#                     # Match the file name using the regex pattern
#                     match = re.match(file_pattern, filename)
#                     if match:
#                         # Extract the number N from the filename
#                         N = match.group(1)
                        
#                         # Construct the new filename
#                         new_filename = f"{feature_selection}_per_fold_missingness_fold_{N}_metrics.csv"
                        
#                         # Get the full path for the current and new file names
#                         old_filepath = os.path.join(sub_root, filename)
#                         new_filepath = os.path.join(sub_root, new_filename)
                        
#                         # Rename the file
#                         os.rename(old_filepath, new_filepath)
#                         print(f"Renamed: {old_filepath} -> {new_filepath}")

# # Provide the main directory where all the feature selection folders are located
# main_directory = "/Users/dylandominguez/Library/CloudStorage/GoogleDrive-domy7912@gmail.com/My Drive/Grad School/Thesis/DylanDominguez-S24-F24-MastersThesis_Shared(Old -DATA ONLY))/Part 2 - Feature Selection Data/Exp#32_prelimdata_Cleveland_10_1_24_imp_pred_eval_2_trials_10_fold_base_fs_pipelines/MAR/MAR_Experiment_2024-09-30_04-00-23.013363/fs_pipeline/MAR/per_fold_results"
# rename_files_in_folders(main_directory)
