## Read all the files of the same student and create a dataset that is averaged over the sample rate.
That means each line of an averaged dataset is the average of one second of samples, so the averaged datasets all end up with the same number of lines.

In [29]:
#create a function to average the lines in a dataframe by sample rate to create a dataframe with the same number of lines as the other dataframes

def create_averaged_dataset(df, sample_rate):
    """Average consecutive rows of ``df`` in chunks of ``sample_rate`` rows.

    Rows are grouped by position (not by index label), so the input may
    carry any index. Each output row is the column-wise mean of one chunk;
    the final chunk may hold fewer than ``sample_rate`` rows.

    Args:
        df: DataFrame whose rows are consecutive samples.
        sample_rate: Number of rows to collapse into one output row.

    Returns:
        A new DataFrame with ``ceil(len(df) / sample_rate)`` rows, indexed
        0, 1, 2, ... by chunk number.

    Raises:
        ValueError: If ``sample_rate`` is smaller than 1.
    """
    if sample_rate < 1:
        raise ValueError(
            "sample_rate must be a positive integer, got {!r}".format(sample_rate)
        )

    # Row k belongs to chunk k // sample_rate. The averaging is done exactly
    # once: repeating it (as a loop over the columns would) averages the
    # already-averaged rows again and shrinks the result a second time.
    chunk_ids = np.arange(len(df)) // sample_rate
    return df.groupby(chunk_ids).mean()

In [30]:
#create a function to concatenate the dataframes horizontally (column-wise, axis=1)

def concat_dataframes(dfs, df, folder_name):
    """Append the columns of ``df`` to the DataFrame stored at ``dfs[folder_name]``.

    The two frames are joined side by side (``axis=1``). The combined frame
    replaces the entry in ``dfs`` and is also returned.

    Args:
        dfs: Dictionary mapping folder names to DataFrames; updated in place.
        df: DataFrame whose columns are added.
        folder_name: Key of the entry in ``dfs`` to extend; must already exist.

    Returns:
        The combined DataFrame now stored under ``dfs[folder_name]``.
    """
    combined = pd.concat([dfs[folder_name], df], axis=1)
    dfs[folder_name] = combined
    return combined

In [39]:
#import the libraries
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

def create_dicts_for_students(parent_folder):
    """Load every subfolder of ``parent_folder`` into raw and averaged DataFrames.

    Each subfolder gets one entry in both returned dictionaries. Every
    non-empty ``.csv`` file inside it is read without a header row, its
    columns are renamed to ``<file name>_<1-based position>``, and it is
    joined column-wise onto the subfolder's raw DataFrame.

    Files whose name does not start with ``IBI`` or ``tag`` are additionally
    averaged: the first value of row 1 is taken as the sample rate, and the
    rows from index 2 onwards are averaged in chunks of that many rows and
    joined column-wise onto the subfolder's averaged DataFrame.

    Args:
        parent_folder: Path of the folder holding one subfolder per recording
            (e.g. ``'./data/S1/'``).

    Returns:
        A tuple ``(dfs, dfs_averaged)`` of dictionaries keyed by subfolder name.
    """
    # IBI and tag are the only CSV files not collected at a fixed sample rate,
    # so they are left out of the averaged dataset
    unsampled_prefixes = ('IBI', 'tag')

    raw_frames = {}
    averaged_frames = {}

    for folder_name in tqdm(os.listdir(parent_folder)):
        folder_path = os.path.join(parent_folder, folder_name)

        # Only subfolders hold recordings; anything else is skipped
        if not os.path.isdir(folder_path):
            continue

        raw_frames[folder_name] = pd.DataFrame()
        averaged_frames[folder_name] = pd.DataFrame()

        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            # Skip anything that is not a CSV file, and CSV files with no content
            if not filename.endswith('.csv') or os.path.getsize(file_path) <= 0:
                continue

            df = pd.read_csv(file_path, header=None)

            # Name the columns after the file (without its ".csv" extension)
            df_name = os.path.splitext(filename)[0]
            df.columns = [
                df_name + '_' + str(position)
                for position, _ in enumerate(df.columns, start=1)
            ]

            if not df_name.startswith(unsampled_prefixes):
                # The sample rate sits in the second line of the file; some
                # files repeat it in every column, so take the first value.
                # (An alternative once tried here was averaging by quartile,
                # i.e. using len(df)//4 as the chunk size.)
                df_averaged = create_averaged_dataset(
                    df.loc[2:], int(df.iloc[1].values[0])
                )

                # Add the averaged columns next to the ones already collected
                averaged_frames[folder_name] = concat_dataframes(
                    averaged_frames, df_averaged, folder_name
                )

            # Add the raw columns next to the ones already collected
            raw_frames[folder_name] = concat_dataframes(raw_frames, df, folder_name)

    return raw_frames, averaged_frames

In [40]:
#Iterate over all the ten students
for i in range(1, 11):
    student = 'S' + str(i)

    # Folder holding the raw recordings of this student
    path = './data/' + student + '/'
    # Build the raw and averaged dictionaries for this student
    dfs, dfs_averaged = create_dicts_for_students(path)

    # Folder that receives the exported CSV files of this student
    outdir = './data/dataframes/' + student + '/'

    # Create the output folder once per student, and only when there is
    # something to write. makedirs also creates a missing './data/dataframes/'
    # parent, which a plain mkdir would fail on.
    if dfs:
        os.makedirs(outdir, exist_ok=True)

    # Write one raw and one averaged CSV file per entry
    for key in dfs:
        dfs[key].to_csv(outdir + student + '_' + key + '.csv', index=False)
        dfs_averaged[key].to_csv(outdir + student + '_' + key + '_averaged_quartile.csv', index=False)

100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
100%|██████████| 4/4 [00:09<00:00,  2.40s/it]
100%|██████████| 4/4 [00:04<00:00,  1.24s/it]
100%|██████████| 4/4 [00:04<00:00,  1.20s/it]
100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
100%|██████████| 4/4 [00:05<00:00,  1.32s/it]
100%|██████████| 4/4 [00:08<00:00,  2.19s/it]
100%|██████████| 4/4 [00:04<00:00,  1.19s/it]
100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
100%|██████████| 4/4 [00:04<00:00,  1.18s/it]
