In [1]:
import pandas as pd
import numpy as np
import os
import ast
import re 

# Create Excel workbooks from the result CSVs

In [2]:
def get_filenames_before_extension(directory):
    """Return the base names (extension removed) of the CSV files in a directory.

    Parameters
    ----------
    directory : str
        Path of the directory to scan. Only its immediate entries are examined.

    Returns
    -------
    list of str
        One name per entry whose file name ends with ``.csv``, in the order
        reported by ``os.listdir``.
    """
    stems = []
    for entry in os.listdir(directory):
        # Anything that is not a ``.csv`` entry is ignored.
        if not entry.endswith('.csv'):
            continue
        stem, _extension = os.path.splitext(entry)
        stems.append(stem)
    return stems

In [3]:
# Directory that holds the result CSV files.
result_dir = 'Results/'
# Directory scanned for dataset CSV files.
data_dir = 'data/'

# One dataset name per CSV file found in data_dir.
# NOTE(review): a later cell reassigns dataset_names to a hard-coded list, so
# this discovered list is currently not what the processing loops use.
dataset_names = get_filenames_before_extension(data_dir)

# Define parameters
# 'one_sem_grd' is listed before 'one_sem' (same order as the later parameter
# cell): 'one_sem' is a prefix of 'one_sem_grd', so keeping the longer name
# first avoids a wrong hit wherever methods are matched against file names.
methods = ['gso_1', 'gso_2', 'validation_score', 'one_sem_grd', 'one_sem']


In [4]:
def parse_list(s):
    """Convert the textual form of a list back into a Python object.

    Two notations are tried in order:

    1. a regular Python literal, e.g. ``"[1, 2, 3]"``;
    2. bracketed, whitespace-separated values, e.g. ``"[1 2 3]"`` (possibly
       spread over several lines), which are rewritten with commas and then
       parsed as a literal.

    Parameters
    ----------
    s : object
        Value to convert, normally a string read from a CSV cell.

    Returns
    -------
    object
        The parsed Python literal when either notation succeeds. Otherwise
        the input is returned exactly as it was passed in; this includes
        non-string values such as NaN or None.

    Notes
    -----
    Any valid literal is converted, not only lists (``"12"`` becomes ``12``).
    NOTE(review): adjacent quoted strings such as ``"['a' 'b']"`` are a valid
    Python literal (implicit concatenation) and parse to ``['ab']`` in step 1.
    """
    # Exceptions ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    try:
        # Try to parse as a Python literal first
        return ast.literal_eval(s)
    except literal_errors:
        pass

    # Missing values (NaN) and other non-text cells cannot be normalised with
    # string methods; hand them back untouched instead of raising.
    if not isinstance(s, str):
        return s

    # Handle space-separated values within brackets. The rewrite is done on a
    # separate variable so that a failed attempt still returns the original text.
    candidate = re.sub(r'\s+', ' ', s.strip())
    candidate = candidate.replace('[ ', '[').replace(' ]', ']')
    candidate = candidate.replace(' ', ',')
    try:
        return ast.literal_eval(candidate)
    except literal_errors:
        return s

In [5]:
def concatenate_csv_files(directory, dataset_name, methods, output_excel):
    """Stack all CSV files of one dataset and save them as a single Excel file.

    Every entry of ``directory`` whose name ends with ``.csv`` and contains
    ``dataset_name`` is read with ``pd.read_csv``. In each column that holds at
    least one text cell starting with ``[``, the text cells are converted with
    ``parse_list``. The frames are then concatenated row-wise with a fresh
    index and written to ``output_excel`` without the index column.

    Parameters
    ----------
    directory : str
        Directory containing the CSV files.
    dataset_name : str
        Substring a file name must contain to be included.
    methods : list of str
        Not used by this function; kept so existing calls keep working.
    output_excel : str
        Path of the Excel file to create.

    Raises
    ------
    ValueError
        If no CSV file in ``directory`` matches ``dataset_name``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the workbook reproducible from run to run.
    csv_files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.csv') and dataset_name in f
    )

    # Fail with a message that names the dataset instead of the generic
    # error pd.concat raises for an empty list.
    if not csv_files:
        raise ValueError(
            f"No CSV files containing '{dataset_name}' found in '{directory}'"
        )

    df_list = []
    for csv_file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(os.path.join(directory, csv_file))

        # Convert string representations of lists back to actual lists.
        # The check inspects the cell values themselves, so it does not depend
        # on which dtype pandas uses to store text, and missing values simply
        # count as "not a list".
        for col in df.columns:
            has_list_text = df[col].apply(
                lambda v: isinstance(v, str) and v.startswith('[')
            ).any()
            if has_list_text:
                # Only text cells are parsed; NaN and other values pass through.
                df[col] = df[col].apply(
                    lambda v: parse_list(v) if isinstance(v, str) else v
                )

        df_list.append(df)

    # Concatenate all DataFrames in the list
    concatenated_df = pd.concat(df_list, ignore_index=True)

    # Save the concatenated DataFrame to an Excel file
    concatenated_df.to_excel(output_excel, index=False)
    print(f'Successfully created {output_excel}')

In [6]:
# Datasets for which a combined workbook is produced.
dataset_names = [
    'chronic_fatigue',
    'epic_ce_ms',
    'epic_lc_ms_neg',
    'epic_lc_ms_pos',
    'gastric_cancer',
    'periodontal_inflammation',
]

# NOTE(review): this cell runs before the cell that adds the Inner_Selection
# column to the CSVs, so on a fresh top-to-bottom run the workbooks only
# contain that column if the CSV files already had it - confirm the intended
# order.
for dataset_name in dataset_names:
    # os.path.join yields the same path as plain concatenation for
    # 'Results/' but does not depend on result_dir ending with a separator.
    output_excel = os.path.join(result_dir, f'{dataset_name}_concat_results.xlsx')
    concatenate_csv_files(result_dir, dataset_name, methods, output_excel)


Successfully created Results/chronic_fatigue_concat_results.xlsx
Successfully created Results/epic_ce_ms_concat_results.xlsx
Successfully created Results/epic_lc_ms_neg_concat_results.xlsx
Successfully created Results/epic_lc_ms_pos_concat_results.xlsx
Successfully created Results/gastric_cancer_concat_results.xlsx
Successfully created Results/periodontal_inflammation_concat_results.xlsx


# Add the Inner_Selection column to every result CSV

In [7]:
def update_csv_files_with_inner_selection(directory, dataset_names, methods):
    """Add an ``Inner_Selection`` column to result CSVs, derived from file names.

    For every dataset name, each entry of ``directory`` whose name ends with
    ``.csv`` and contains that dataset name is inspected. The method is the
    longest entry of ``methods`` that occurs in the file name. Files matching
    no method are ignored. Files lacking an ``Inner_Selection`` column get one
    filled with the method name and are saved in place; files that already
    have the column are left untouched.

    Parameters
    ----------
    directory : str
        Directory containing the CSV files (modified in place).
    dataset_names : iterable of str
        Substrings identifying the files of each dataset.
    methods : iterable of str
        Candidate method names to look for in the file names.
    """
    # Try longer names first so that a name which is a substring of another
    # ('one_sem' inside 'one_sem_grd') cannot claim the more specific file,
    # whatever order the caller listed them in. sorted() is stable, so names
    # of equal length keep the caller's order.
    methods_by_length = sorted(methods, key=len, reverse=True)

    # The loop only rewrites existing files, so the listing is taken once.
    # Sorting makes the processing and log order reproducible.
    all_csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

    for dataset_name in dataset_names:
        for csv_file in all_csv_files:
            if dataset_name not in csv_file:
                continue

            # Determine the method from the file name
            method = next((m for m in methods_by_length if m in csv_file), None)
            if method is None:
                continue

            file_path = os.path.join(directory, csv_file)
            df = pd.read_csv(file_path)

            # Nothing to add: skip the rewrite so the file is not needlessly
            # round-tripped and the log does not claim an update.
            if 'Inner_Selection' in df.columns:
                print(f'Skipped {csv_file}: Inner_Selection already present')
                continue

            df['Inner_Selection'] = method

            # Save the DataFrame back to the same CSV file
            df.to_csv(file_path, index=False)
            print(f'Updated {csv_file} with Inner_Selection = {method}')


In [8]:
# Define parameters
# Parameters for the Inner_Selection back-fill
directory = 'Results/'
dataset_names = [
    'chronic_fatigue',
    'epic_ce_ms',
    'epic_lc_ms_neg',
    'epic_lc_ms_pos',
    'gastric_cancer',
    'periodontal_inflammation',
]
# Longer method names are kept ahead of their prefixes ('one_sem_grd' before
# 'one_sem') so substring matching on file names cannot pick the shorter one.
methods = [
    'gso_1',
    'gso_2',
    'validation_score',
    'one_sem_grd',
    'one_sem',
]

# Tag every matching CSV in `directory` with its method name
update_csv_files_with_inner_selection(directory, dataset_names, methods)

Updated chronic_fatigue_one_sem_grd_all_features_outerloops_results.csv with Inner_Selection = one_sem_grd
Updated chronic_fatigue_validation_score_all_features_outerloops_results.csv with Inner_Selection = validation_score
Updated chronic_fatigue_one_sem_all_features_outerloops_results.csv with Inner_Selection = one_sem
Updated chronic_fatigue_gso_1_all_features_outerloops_results.csv with Inner_Selection = gso_1
Updated chronic_fatigue_gso_2_all_features_outerloops_results.csv with Inner_Selection = gso_2
Updated epic_ce_ms_validation_score_all_features_outerloops_results.csv with Inner_Selection = validation_score
Updated epic_ce_ms_one_sem_grd_all_features_outerloops_results.csv with Inner_Selection = one_sem_grd
Updated epic_ce_ms_gso_2_all_features_outerloops_results.csv with Inner_Selection = gso_2
Updated epic_ce_ms_one_sem_all_features_outerloops_results.csv with Inner_Selection = one_sem
Updated epic_ce_ms_gso_1_all_features_outerloops_results.csv with Inner_Selection = gso_1