In [11]:
import pandas as pd
from pathlib import Path

In [12]:
# Root directory that is searched recursively for experiment folders / time.csv files.
INPUT_DIR = Path("../output/analysis-1024")
# Directory that receives the aggregated results CSV.
OUTPUT_DIR = "../output/results"
# Selects which metric set is summarised: "gpu" or "cpu".
DEVICE = 'gpu'

In [13]:
# Metrics functions
def get_min_max_mean(df, column, start_time, end_time):
    """Return ``(min, max, mean)`` of ``column`` inside a closed time window.

    A row is part of the window when its 'TIMESTAMP' value satisfies
    ``start_time <= TIMESTAMP <= end_time`` (both bounds inclusive).
    """
    stamps = df['TIMESTAMP']
    in_window = (stamps >= start_time) & (stamps <= end_time)
    selected = df.loc[in_window, column]
    return selected.min(), selected.max(), selected.mean()
  
def add_min_max_mean_columns(df, column, index, min_value, max_value, mean_value):
    """Store three statistics in row ``index`` of ``df`` (modifies ``df`` in place).

    The values are written to ``<column>_MIN``, ``<column>_MAX`` and
    ``<column>_MEAN``, in that order. Nothing is returned.
    """
    targets = {
        f'{column}_MIN': min_value,
        f'{column}_MAX': max_value,
        f'{column}_MEAN': mean_value,
    }
    for label, value in targets.items():
        df.loc[index, label] = value

def sumarise_df_cpu(df_cpu, df_mem_cpu, df_timestamp):
    """Add CPU and CPU-memory min/max/mean columns to each row of ``df_timestamp``.

    Every row of ``df_timestamp`` defines a window [START_TIME, END_TIME].
    The ' CPU' column of ``df_cpu`` and the ' USED' column of ``df_mem_cpu``
    are summarised over that window into the CPU_* and MEM_CPU_* columns.

    All three frames are modified in place:
      * 'TIMESTAMP' of ``df_cpu`` / ``df_mem_cpu`` is cast to int.
      * START_TIME / END_TIME are cut to the first 10 characters of their
        integer representation (presumably turning a finer-grained epoch
        value into epoch seconds -- confirm against the producer of time.csv).
      * Missing values in any of these columns become 0, matching
        ``sumarise_df_gpu``.

    Returns:
        The same ``df_timestamp`` object, with the new columns added.
    """
    for samples in (df_cpu, df_mem_cpu):
        # NaN cannot be cast to int; map it to 0 like sumarise_df_gpu does.
        samples['TIMESTAMP'] = samples['TIMESTAMP'].fillna(0).astype(int)

    for bound in ('START_TIME', 'END_TIME'):
        # Cast to int *before* stringifying: a float column (e.g. one that held
        # NaN) would render 0 as '0.0', which the final int conversion rejects.
        whole = pd.to_numeric(df_timestamp[bound]).fillna(0).astype(int)
        df_timestamp[bound] = whole.astype(str).str[:10].astype(int)

    # (sample frame, source column, output column prefix) for each metric.
    metrics = (
        (df_cpu, ' CPU', 'CPU'),
        (df_mem_cpu, ' USED', 'MEM_CPU'),
    )

    # Snapshot the windows up front so the columns added below cannot
    # influence the iteration.
    windows = list(zip(df_timestamp.index,
                       df_timestamp['START_TIME'],
                       df_timestamp['END_TIME']))

    for i, start_time, end_time in windows:
        for samples, source_column, prefix in metrics:
            min_value, max_value, mean_value = get_min_max_mean(
                samples, source_column, start_time, end_time)
            add_min_max_mean_columns(
                df_timestamp, prefix, i, min_value, max_value, mean_value)

    return df_timestamp
  
def sumarise_df_gpu(df_cpu, df_mem_cpu, df_gpu, df_mem_gpu, df_timestamp):
    """Add CPU, CPU-memory, GPU and GPU-memory statistics to ``df_timestamp``.

    Every row of ``df_timestamp`` defines a window [START_TIME, END_TIME].
    For each window the min/max/mean of these columns are stored:
      * ``df_cpu[' CPU']``      -> CPU_MIN / CPU_MAX / CPU_MEAN
      * ``df_mem_cpu[' USED']`` -> MEM_CPU_MIN / MEM_CPU_MAX / MEM_CPU_MEAN
      * ``df_gpu[' GPU']``      -> GPU_MIN / GPU_MAX / GPU_MEAN
      * ``df_mem_gpu[' USED']`` -> MEM_GPU_MIN / MEM_GPU_MAX / MEM_GPU_MEAN

    All five frames are modified in place:
      * 'TIMESTAMP' of the four sample frames has NaN replaced by 0 and is
        cast to int.
      * START_TIME / END_TIME have NaN replaced by 0 and are cut to the first
        10 characters of their integer representation (presumably turning a
        finer-grained epoch value into epoch seconds -- confirm against the
        producer of time.csv).

    Returns:
        The same ``df_timestamp`` object, with the new columns added.
    """
    for samples in (df_cpu, df_mem_cpu, df_gpu, df_mem_gpu):
        samples['TIMESTAMP'] = samples['TIMESTAMP'].fillna(0).astype(int)

    for bound in ('START_TIME', 'END_TIME'):
        # Cast to int *before* stringifying. A column that contained NaN is
        # float, so the filled-in 0 would render as '0.0' and the final int
        # conversion would raise -- which defeated the purpose of fillna(0).
        whole = pd.to_numeric(df_timestamp[bound]).fillna(0).astype(int)
        df_timestamp[bound] = whole.astype(str).str[:10].astype(int)

    # (sample frame, source column, output column prefix) for each metric.
    metrics = (
        (df_cpu, ' CPU', 'CPU'),
        (df_mem_cpu, ' USED', 'MEM_CPU'),
        (df_gpu, ' GPU', 'GPU'),
        (df_mem_gpu, ' USED', 'MEM_GPU'),
    )

    # Snapshot the windows up front so the columns added below cannot
    # influence the iteration.
    windows = list(zip(df_timestamp.index,
                       df_timestamp['START_TIME'],
                       df_timestamp['END_TIME']))

    for i, start_time, end_time in windows:
        for samples, source_column, prefix in metrics:
            min_value, max_value, mean_value = get_min_max_mean(
                samples, source_column, start_time, end_time)
            add_min_max_mean_columns(
                df_timestamp, prefix, i, min_value, max_value, mean_value)

    return df_timestamp


In [14]:
# Summarise every experiment directory below INPUT_DIR and write the per-window
# statistics back into that directory's time.csv (the file is overwritten).
# rglob() already searches recursively, so no '**/' prefix is needed.
for dir_path in INPUT_DIR.rglob('experiment*'):
    try:
        df_timestamp = pd.read_csv(dir_path / 'time.csv')

        if "cpu" in DEVICE:
            df_cpu = pd.read_csv(dir_path / 'cpu.csv')
            df_mem_cpu = pd.read_csv(dir_path / 'mem.csv')
            df_timestamp = sumarise_df_cpu(df_cpu, df_mem_cpu, df_timestamp)

        if "gpu" in DEVICE:
            df_cpu = pd.read_csv(dir_path / 'cpu.csv')
            df_mem_cpu = pd.read_csv(dir_path / 'mem.csv')
            df_gpu = pd.read_csv(dir_path / 'gpu.csv')
            df_mem_gpu = pd.read_csv(dir_path / 'mem-gpu.csv')
            # Keep only rows whose command contains '/src/main' (presumably the
            # process under test -- confirm). This filter used to be computed
            # and then thrown away, so it never had any effect.
            # na=False treats rows without a command as non-matching;
            # .copy() lets sumarise_df_gpu assign columns without a
            # SettingWithCopyWarning.
            is_main = df_mem_gpu[' COMMAND'].str.contains('/src/main', regex=False, na=False)
            df_mem_gpu = df_mem_gpu[is_main].copy()
            df_timestamp = sumarise_df_gpu(df_cpu, df_mem_cpu, df_gpu, df_mem_gpu, df_timestamp)

        # multiply the MEM_CPU by 1000 to get the value in MB
        for stat_column in ('MEM_CPU_MIN', 'MEM_CPU_MAX', 'MEM_CPU_MEAN'):
            df_timestamp[stat_column] = df_timestamp[stat_column] * 1000

        df_timestamp.to_csv(dir_path / 'time.csv', index=False)

    except Exception as e:
        # Best effort: report the directory that failed and continue with the next.
        print(f"Error on {dir_path}", e)


In [15]:
# Collect every time.csv below INPUT_DIR into one table and write it to
# OUTPUT_DIR as '<DEVICE>-<threads>.csv'.

# The suffix after the last '-' in the input directory name
# (e.g. 'analysis-1024' -> '1024') labels both the rows and the output file.
threads = INPUT_DIR.name.split('-')[-1]

# Gather the frames in a list and concatenate once; concatenating inside the
# loop copies the accumulated data on every iteration.
frames = []
# rglob() already searches recursively, so no '**/' prefix is needed.
for timestamp_file_path in INPUT_DIR.rglob('time.csv'):
    try:
        df_timestamp = pd.read_csv(timestamp_file_path)
        df_timestamp.insert(0, 'directory', timestamp_file_path)
        df_timestamp.insert(1, 'threads', threads)
        df_timestamp.insert(2, 'device', DEVICE)
        frames.append(df_timestamp)
    except Exception as e:
        # Best effort: report the file that failed and continue with the next.
        print(f"Error on {timestamp_file_path}", e)

if not frames:
    # Fail with a clear message instead of a KeyError from the column selection.
    raise FileNotFoundError(f"No readable time.csv found under {INPUT_DIR}")

df = pd.concat(frames, axis=0)

#keep only the expected columns:
cpu_columns = ['directory','threads','device',
               'PHASE','TIMESTAMP','START_TIME','END_TIME',
               'CPU_MIN','CPU_MAX','CPU_MEAN','MEM_CPU_MIN',
               'MEM_CPU_MAX','MEM_CPU_MEAN']
gpu_columns = ['GPU_MIN','GPU_MAX','GPU_MEAN',
               'MEM_GPU_MIN','MEM_GPU_MAX','MEM_GPU_MEAN']

if "gpu" in DEVICE:
    df = df[cpu_columns + gpu_columns]
elif "cpu" in DEVICE:
    df = df[cpu_columns]

df = df.sort_values(by=['directory']).reset_index(drop=True)

# to_csv does not create missing directories.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
df.to_csv(f'{OUTPUT_DIR}/{DEVICE}-{threads}.csv', index=False)