In [110]:
import pandas as pd
import glob

class data_analysis:
    """Load, filter and aggregate the CSV logs of one batch of experiments.

    All files are read from ``../csv_files/<directory>/``:

    * ``info.csv`` - one row per experiment, loaded into ``self.df['info']``;
    * ``epoch_step_info/*epoch.csv`` - concatenated into ``self.df['epoch']``;
    * ``epoch_step_info/*step.csv`` - concatenated into ``self.df['step']``.

    After loading, the epoch and step frames are merged with the info frame on
    ``exper_id`` so every epoch/step row carries its experiment's info columns.
    """

    def __init__(self, directory):
        """Read every CSV of the experiment batch stored under ``directory``.

        Args:
            directory: Name of the sub-folder of ``../csv_files/`` to load.

        Raises:
            ValueError: If no epoch file or no step file matches.
        """
        # Directory where all CSV files are located
        self.directory = "../csv_files/"+directory+"/"

        self.df = {}
        self.grouped_df = {}

        # Create experiments info dataframe
        info_file = self.directory + "info.csv"
        self.df['info'] = pd.read_csv(info_file)

        epoch_files = self.directory + "epoch_step_info/*epoch.csv"
        step_files = self.directory + "epoch_step_info/*step.csv"

        # Combine all files for steps and epochs in two dataframes
        self.df['epoch'], self.df['step'] = self.concat_epoch_step_files(epoch_files, step_files)

        # Add for each step and epoch dataframe row all the experiment info
        self.epoch_step_with_info()

    @staticmethod
    def _read_and_concat(pattern):
        """Read every CSV matching the glob ``pattern`` into one dataframe."""
        # glob returns paths in arbitrary order; sort so row order is reproducible.
        paths = sorted(glob.glob(pattern))
        if not paths:
            # Same exception type pd.concat raises on an empty list, but the
            # message names the pattern that matched nothing.
            raise ValueError(f"No CSV files match pattern: {pattern}")
        return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

    # Combine all files for steps and epochs in two dataframes
    def concat_epoch_step_files(self, epoch_files, step_files):
        """Concatenate all epoch files and all step files into two dataframes.

        The two patterns are processed independently, so a different number of
        epoch and step files does not cause any file to be skipped.

        Args:
            epoch_files: Glob pattern matching the per-experiment epoch CSVs.
            step_files: Glob pattern matching the per-experiment step CSVs.

        Returns:
            Tuple ``(epoch_df, step_df)``, each with a fresh integer index.

        Raises:
            ValueError: If either pattern matches no file.
        """
        epoch_df = self._read_and_concat(epoch_files)
        step_df = self._read_and_concat(step_files)
        return epoch_df, step_df

    # Add for each step and epoch dataframe row all the experiment info
    def epoch_step_with_info(self):
        """Merge the info frame into the epoch and step frames on ``exper_id``.

        Uses ``pd.merge`` with its default (inner) join, so epoch/step rows whose
        ``exper_id`` is absent from the info frame are dropped.
        """
        self.df['epoch'] = pd.merge(self.df['info'], self.df['epoch'], on='exper_id')
        self.df['step'] = pd.merge(self.df['info'], self.df['step'], on='exper_id')

    # Select from dataframes the rows that have specific values in columns
    def select_where(self, selections):
        """Filter every dataframe in ``self.df`` in place by column values.

        Args:
            selections: Mapping ``column -> required value``. Every column must
                exist in each frame of ``self.df``. The ``'threshold'`` column
                is special-cased: rows whose threshold is NaN are kept as well
                as rows equal to the requested value. An empty mapping leaves
                all frames unfiltered.
        """
        exact_matches = {column: value for column, value in selections.items()
                         if column != 'threshold'}
        threshold_in_columns = 'threshold' in selections

        # Snapshot the items because the dict values are reassigned in the loop.
        for key, df in list(self.df.items()):
            # Start from "keep everything" so an empty selection is a no-op.
            mask = pd.Series(True, index=df.index, dtype=bool)

            for column, value in exact_matches.items():
                mask &= df[column] == value

            if threshold_in_columns:
                # NaN thresholds are deliberately kept alongside exact matches.
                mask &= (df['threshold'] == selections['threshold']) | df['threshold'].isna()

            self.df[key] = df[mask]

    def group_repeated_expers(self, attributes, key, aggr, time = False):
        """Aggregate a metric over experiments that share the same attributes.

        Args:
            attributes: Column name(s) to group by; NaN group keys are kept.
            key: Which frame of ``self.df`` to aggregate.
            aggr: Name of the column whose min / mean / max are computed.
            time: If True, also report the mean of the ``'time'`` column and
                flatten the column names to ``<column>_<statistic>``.

        Returns:
            A dataframe with one row per attribute combination. With
            ``time=False`` the statistic columns are ``min``, ``mean``, ``max``;
            with ``time=True`` they are ``<aggr>_min``, ``<aggr>_mean``,
            ``<aggr>_max`` and ``time_mean``.
        """
        # Group by multiple columns
        grouped_df = self.df[key].groupby(attributes, dropna=False)
        if time:
            resulting_df = grouped_df.agg({aggr: ['min', 'mean', 'max'], 'time': 'mean'}).reset_index()
            # Flatten the two-level columns; group-key columns have an empty second level.
            resulting_df.columns = [f"{col[0]}_{col[1]}" if col[1] else col[0] for col in resulting_df.columns]
        else:
            # Aggregate the requested metric (previously 'time' was always used
            # here and `aggr` was ignored).
            resulting_df = grouped_df[aggr].agg(['min', 'mean', 'max']).reset_index()
        return resulting_df

In [111]:
# NOTE(review): data_analysis.__init__ takes a required `directory` argument (the
# sub-folder of ../csv_files/ to load); called without it, this line raises
# TypeError on a fresh kernel. TODO: pass the experiment directory name.
da = data_analysis()
# Keep only rows with batch_size == 128 whose threshold is 50 or missing (NaN);
# this filters every frame held in da.df in place.
da.select_where({'batch_size': 128, 'threshold': 50})
da.df['info']

Unnamed: 0,exper_id,algorithm,model,clients,clients_distr,epochs,threshold,batch_size,syncs,duration,timestamp,epoch_steps
0,0,synchronous,adv_cnn,16,"[2, 2, 2, 2, 2, 2, 2, 2]",100,,128,2922,3167.576719,23-11-2023 17:02,29
1,1,synchronous,adv_cnn,8,"[2, 2, 2, 2]",100,,128,5837,4965.47203,23-11-2023 17:01,58
2,2,synchronous,adv_cnn,4,"[2, 2]",100,,128,11715,7208.223564,23-11-2023 17:01,117
5,6,naive,adv_cnn,16,"[2, 2, 2, 2, 2, 2, 2, 2]",100,50.0,128,34,1431.400897,23-11-2023 20:17,29
12,13,naive,adv_cnn,8,"[2, 2, 2, 2]",100,50.0,128,69,2806.437517,23-11-2023 21:54,58
18,20,linear,adv_cnn,16,"[2, 2, 2, 2, 2, 2, 2, 2]",100,50.0,128,32,1492.374927,23-11-2023 23:58,29
21,23,naive,adv_cnn,4,"[2, 2]",100,50.0,128,148,5521.489052,23-11-2023 23:45,117
33,36,sketch,adv_cnn,16,"[2, 2, 2, 2, 2, 2, 2, 2]",100,50.0,128,32,3833.765501,24-11-2023 03:44,29
34,37,linear,adv_cnn,8,"[2, 2, 2, 2]",100,50.0,128,70,2893.813023,24-11-2023 04:19,58
51,57,sketch,adv_cnn,8,"[2, 2, 2, 2]",100,50.0,128,60,7585.523808,24-11-2023 10:56,58


In [None]:
# NOTE(review): `resulting_df` is not assigned in any cell visible here; it only
# exists as a local inside data_analysis.group_repeated_expers. Presumably this
# cell should display that method's return value - TODO confirm and assign it first.
resulting_df

In [None]:
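# NOTE: stale scratch code - the data_analysis class stores its frames in
# da.df['epoch'] / da.df['info']; it defines no `epoch_df` / `info_df` attributes.
#grouped_df = da.epoch_df.groupby('exper_id')['accuracy'].max().reset_index()
#grouped_df = pd.merge(da.info_df, grouped_df, on='exper_id')
#info_df = da.info_df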