### Import modules and load every raw Excel workbook from each group folder (Controls, LC, ME)

In [40]:
import glob
import os
import pandas as pd

# One folder per participant group; each folder holds one .xlsx workbook per participant.
folder_paths = ["Controls", "LC", "ME"]

# Dictionaries to hold DataFrames and headers for each folder
all_dfs = {}
all_headers = {}

for path in folder_paths:
    # glob gives no ordering guarantee (it depends on the file system). The position of a
    # workbook in this list is later used to label participants, so sort the matches to keep
    # that numbering reproducible across machines and runs.
    # NOTE: despite the variable name, these are .xlsx workbooks, not CSV files.
    csv_files = sorted(glob.glob(os.path.join(path, '*.xlsx')))
    print(csv_files)

    # Read the data without a header row, skipping the first 3 rows of each sheet
    dfs = [pd.read_excel(file, header=None, skiprows=3) for file in csv_files]

    # Read the column names separately: they are taken from the FIRST row of each sheet
    # (nrows=1 with no skiprows). Rows 2-3 are skipped above and never loaded
    # (presumably units/metadata rows -- confirm against the raw files).
    headers = [pd.read_excel(file, nrows=1, header=None) for file in csv_files]

    # Assign the header row to each DataFrame
    for df, header in zip(dfs, headers):
        df.columns = header.iloc[0]

    # Store the DataFrames and headers in the dictionaries
    all_dfs[path] = dfs
    all_headers[path] = headers

# 25 seconds

['Controls\\P110061.xlsx', 'Controls\\P110090.xlsx', 'Controls\\P110096.xlsx', 'Controls\\P110097.xlsx', 'Controls\\P110098.xlsx', 'Controls\\P110099.xlsx', 'Controls\\P110101.xlsx', 'Controls\\P110102.xlsx', 'Controls\\P110103.xlsx', 'Controls\\P110104.xlsx', 'Controls\\P110106.xlsx', 'Controls\\P110107.xlsx', 'Controls\\P110109.xlsx', 'Controls\\P110110.xlsx', 'Controls\\P110112.xlsx', 'Controls\\P110114.xlsx', 'Controls\\P110115.xlsx', 'Controls\\P110126_P110126_CPET.xlsx', 'Controls\\P110132_P110132_CPET.xlsx', 'Controls\\P110139_P110139_CPET.xlsx', 'Controls\\P110140_110140_CPET.xlsx', 'Controls\\P110148_P110148_CPET.xlsx', 'Controls\\P110149_P110149_CPET.xlsx', 'Controls\\P110150_P110150_CPET.xlsx', 'Controls\\P110151_P110151_CPET.xlsx', 'Controls\\P110152_P110152_CPET.xlsx']
['LC\\P110017.xlsx', 'LC\\P110039.xlsx', 'LC\\P110050.xlsx', 'LC\\P110051.xlsx', 'LC\\P110055.xlsx', 'LC\\P110056.xlsx', 'LC\\P110058.xlsx', 'LC\\P110064.xlsx', 'LC\\P110085.xlsx', 'LC\\P110086.xlsx', 'LC\\P

### Create a copy of the grouped data frame to work with, to avoid having to import all the data frames over and over again

In [41]:
# Working copies of the loaded frames, keyed by group, so that later in-place
# edits never touch the originals held in all_dfs.
groups = ['Controls', 'LC', 'ME']
all_dfs_c = {
    name: [frame.copy() for frame in all_dfs[name]]
    for name in groups
}

### Count the number of dataframes that contain (or lack) each column in `cols_to_use`, and identify the missing columns

In [42]:
# Columns that should survive into the cleaned data set.
cols_to_use = ['t', 'Power', 'HR', 'VE', 'VO2', 'VCO2', 'PetCO2', 'PetO2', 'VO2/Kg', 'VE/VO2', 'VE/VCO2', 'RQ', 'VT', 'Rf', 'Ti', 'Te', 'Phase']

# count[col]    -> number of dataframes that contain col
# no_count[col] -> number of dataframes that lack col
count = dict.fromkeys(cols_to_use, 0)
no_count = dict.fromkeys(cols_to_use, 0)

for frames in all_dfs_c.values():
    for frame in frames:
        for wanted in cols_to_use:
            tally = count if wanted in frame.columns else no_count
            tally[wanted] += 1

print(count)
print(no_count)

# List only the columns that are absent from at least one dataframe.
for col_name, n_missing in no_count.items():
    if n_missing > 0:
        print(col_name)
        print(n_missing)

{'t': 63, 'Power': 63, 'HR': 60, 'VE': 63, 'VO2': 63, 'VCO2': 63, 'PetCO2': 63, 'PetO2': 63, 'VO2/Kg': 63, 'VE/VO2': 63, 'VE/VCO2': 63, 'RQ': 63, 'VT': 63, 'Rf': 63, 'Ti': 63, 'Te': 63, 'Phase': 63}
{'t': 0, 'Power': 0, 'HR': 3, 'VE': 0, 'VO2': 0, 'VCO2': 0, 'PetCO2': 0, 'PetO2': 0, 'VO2/Kg': 0, 'VE/VO2': 0, 'VE/VCO2': 0, 'RQ': 0, 'VT': 0, 'Rf': 0, 'Ti': 0, 'Te': 0, 'Phase': 0}
HR
3


### Remove unwanted columns


In [43]:
# Strip, in place, every column whose label is not listed in cols_to_use.
for frames in all_dfs_c.values():
    for frame in frames:
        extra_cols = [label for label in frame.columns if label not in cols_to_use]
        frame.drop(columns=extra_cols, inplace=True)

### Identify NaNs per group, per dataframe, per column

In [44]:
# Report, group by group, which columns of which dataframe contain NaNs.
for group_name, frames in all_dfs_c.items():
    # Per-dataframe NaN tally: {dataframe position: {column: NaN count}}
    nan_checks = {
        position: {col: frame[col].isna().sum() for col in frame.columns}
        for position, frame in enumerate(frames)
    }

    print(f"NaN checks for group '{group_name}':")
    for df_index, per_column in nan_checks.items():
        # Every dataframe gets a heading line, even when it has no NaNs.
        print(f"DataFrame {df_index}:")
        affected = {col: n for col, n in per_column.items() if n > 0}
        for col, nan_count in affected.items():
            print(f"  {col}: {nan_count} NaNs")
# 0.5 seconds

NaN checks for group 'Controls':
DataFrame 0:
DataFrame 1:
DataFrame 2:
  Power: 1 NaNs
DataFrame 3:
DataFrame 4:
DataFrame 5:
DataFrame 6:
DataFrame 7:
DataFrame 8:
DataFrame 9:
DataFrame 10:
DataFrame 11:
DataFrame 12:
DataFrame 13:
DataFrame 14:
DataFrame 15:
  HR: 1 NaNs
DataFrame 16:
DataFrame 17:
  Power: 1 NaNs
DataFrame 18:
DataFrame 19:
DataFrame 20:
  HR: 1 NaNs
DataFrame 21:
DataFrame 22:
DataFrame 23:
DataFrame 24:
DataFrame 25:
NaN checks for group 'LC':
DataFrame 0:
DataFrame 1:
DataFrame 2:
DataFrame 3:
DataFrame 4:
DataFrame 5:
DataFrame 6:
  HR: 1 NaNs
DataFrame 7:
DataFrame 8:
DataFrame 9:
DataFrame 10:
NaN checks for group 'ME':
DataFrame 0:
DataFrame 1:
DataFrame 2:
DataFrame 3:
DataFrame 4:
DataFrame 5:
DataFrame 6:
DataFrame 7:
DataFrame 8:
  HR: 2 NaNs
DataFrame 9:
DataFrame 10:
DataFrame 11:
DataFrame 12:
DataFrame 13:
  Power: 1 NaNs
DataFrame 14:
DataFrame 15:
DataFrame 16:
DataFrame 17:
DataFrame 18:
DataFrame 19:
  HR: 2 NaNs
DataFrame 20:
DataFrame 21:
  HR

In [45]:
def hhmmss_to_seconds(hhmmss):
    """Convert a colon-separated 'hours:minutes:seconds' string to total seconds.

    Parameters
    ----------
    hhmmss : str
        Exactly three integer fields separated by ':' (e.g. '01:02:03').

    Returns
    -------
    int
        hours * 3600 + minutes * 60 + seconds.

    Raises
    ------
    ValueError
        If the string does not split into exactly three fields, or a field
        is not a valid integer.
    """
    hours, minutes, seconds = (int(field) for field in hhmmss.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

In [46]:
# Remove, in place, every row that still contains a NaN.
for frames in all_dfs_c.values():
    for frame in frames:
        frame.dropna(inplace=True)

# Convenience aliases for the per-group lists of dataframes.
Control, LC, ME = (all_dfs_c[key] for key in ('Controls', 'LC', 'ME'))

# Tag each row with '<group>_<position of its dataframe within the group>'.
for groupname, frames in all_dfs_c.items():
    for position, frame in enumerate(frames):
        frame['Participant'] = f'{groupname}_{position}'

In [47]:
# Final column layout for the exported tables.
col_order = ['t', 'Power', 'HR', 'VE', 'VO2', 'VCO2', 'PetCO2', 'PetO2', 'VO2/Kg', 'VE/VO2', 'VE/VCO2', 'RQ', 'VT', 'Rf', 'Ti', 'Te', 'Participant', 'Phase']

# Stack each group's dataframes row-wise into a single table, then put the
# columns in the order given by col_order.
Control_df, LC_df, ME_df = (
    pd.concat(frames, axis=0)[col_order]
    for frames in (Control, LC, ME)
)

CHANGING THE TIME STRING `t` (HH:MM:SS) TO SECONDS

In [48]:
# Replace each table's 't' column with its value in seconds: cast to string
# first, then parse every entry with hhmmss_to_seconds.
for frame in (Control_df, LC_df, ME_df):
    frame['t'] = frame['t'].astype('string').apply(hhmmss_to_seconds)


In [49]:
# Write one space-separated '<group>.data' file per group, without the index column.
for group_name, df_group in (('Controls', Control_df), ('LC', LC_df), ('ME', ME_df)):
    df_group.to_csv(f'{group_name}.data', sep=' ', index=False)

NOTES:
- Heart rate (`HR`) values still need to be cleaned; the `HR` column is missing entirely from 3 of the 63 files.