# In [1]:
# import pandas as pd
# import plotly.express as px
# import os

# def convert_latency(value):
#     if 'us' in value:
#         return float(value.replace('us', '')) * 1e-3
#     elif 'ms' in value:
#         return float(value.replace('ms', ''))
#     else:
#         return float(value)

# def plot_data_from_logs(log_folders):
#     all_data = []
#     # Process each log directory
#     for log_folder in log_folders:
#         fs_type = os.path.basename(log_folder)  # Use directory name as file system type
#         for log_file in os.listdir(log_folder):
#             file_path = os.path.join(log_folder, log_file)
#             get_name = os.path.basename(log_file)
#             if os.path.isfile(file_path) and file_path.endswith('.log'):
#                 data = pd.read_csv(file_path, index_col=None, header=0)
#                 data['FileSystem'] = fs_type  # Add a new column for file system type
#                 data['Size'] = get_name
#                 all_data.append(data)
    
#     # Concatenate all dataframes into one
#     df = pd.concat(all_data, axis=0, ignore_index=True)

#     print(df.columns)

#     # Convert latency to consistent units (milliseconds)
#     latency_columns = ['putc_latency', 'put_block_latency', 'rewrite_latency',
#                        'getc_latency', 'get_block_latency', 'seeks_latency',
#                        'seq_create_latency', 'seq_stat_latency', 'seq_del_latency',
#                        'ran_create_latency', 'ran_stat_latency', 'ran_del_latency']
#     for col in latency_columns:
#         if col in df.columns:
#             df[col] = df[col].astype(str).apply(convert_latency)


#     throughput_columns = [
#         # sequential output
#         ('putc', 'putc_latency'),
#         ('put_block', 'put_block_latency'),
#         ('rewrite', 'rewrite_latency'),

#         # sequential input
#         ('getc', 'getc_latency'),
#         ('get_block', 'get_block_latency'),

#         # seeks
#         ('seeks', 'seeks_latency'),

#         # sequential create
#         ('seq_create', 'seq_create_latency'),
#         ('seq_stat', 'seq_stat_latency'),
#         ('seq_del', 'seq_del_latency'),

#         # random create
#         ('ran_create', 'ran_create_latency'),
#         ('ran_stat', 'ran_stat_latency'),
#         ('ran_del', 'ran_del_latency'),
#     ]


    
#     # plot for all sizes
#     for throughput, latency in throughput_columns:
#         if throughput in df.columns and latency in df.columns:
#             fig = px.scatter(df, x=throughput, y=latency, color='FileSystem',
#                              title=f'{throughput} Throughput vs. {latency} Latency -- All',
#                              labels={throughput: f"{throughput} Throughput (K/s)",
#                                      latency: f"{latency} Latency (ms)"},
#                              hover_data=['FileSystem'])
#             fig.update_layout(legend_title_text='File System Type')
#             fig.show()
#         else:
#             print(f'Missing data columns for {throughput} or {latency}')

#     # same but different plot for each size
#     for size in df['Size'].unique():
#         if size == '1.log' or size == '1024.log' or size == '8192.log':
#             for throughput, latency in throughput_columns:
#                 if throughput in df.columns and latency in df.columns:
#                     fig = px.scatter(df[df['Size'] == size], x=throughput, y=latency, color='FileSystem',
#                                     title=f'{throughput} Throughput vs. {latency} Latency -- {size}',
#                                     labels={throughput: f"{throughput} Throughput (K/s)",
#                                             latency: f"{latency} Latency (ms)"},
#                                     hover_data=['FileSystem'])
#                     fig.update_layout(legend_title_text='File System Type')
#                     fig.show()
#                 else:
#                     print(f'Missing data columns for {throughput} or {latency}')


# # List of your log directories, adjust these to match your actual directories
# log_folders = ['./ntfs', './xfs', 'shm']

# # Invoke the function with a list of directories
# plot_data_from_logs(log_folders)


# In [4]:
import pandas as pd
import plotly.express as px
import os

def convert_latency(value):
    """Convert a latency reading to milliseconds.

    The reading may carry a unit suffix: ``us`` (microseconds), ``ms``
    (milliseconds) or ``s`` (seconds). A reading without a recognised
    suffix is parsed as a plain number and returned unchanged.

    Args:
        value: Latency reading, e.g. ``'250us'``, ``'12ms'``, ``'3s'``,
            ``'7.5'`` or a bare number. Non-string input is converted
            with ``str()`` first.

    Returns:
        The latency as a float, scaled to milliseconds when a unit
        suffix is present.

    Raises:
        ValueError: If the numeric part cannot be parsed by ``float``.
    """
    text = str(value).strip()
    # Two-letter suffixes are tested first because 'us' and 'ms' also end
    # in 's' and would otherwise be mistaken for seconds.
    for suffix, to_ms in (('us', 1e-3), ('ms', 1.0), ('s', 1e3)):
        if text.endswith(suffix):
            return float(text[:-len(suffix)]) * to_ms
    return float(text)

def plot_data_from_logs(log_folders):
    all_data = []
    test_categories = {
        'putc': 'Sequential Output',
        'put_block': 'Sequential Output',
        'rewrite': 'Sequential Output',
        'getc': 'Sequential Input',
        'get_block': 'Sequential Input',
        'seeks': 'Seeks',
        'seq_create': 'Sequential Create',
        'seq_stat': 'Sequential Create',
        'seq_del': 'Sequential Create',
        'ran_create': 'Random Create',
        'ran_stat': 'Random Create',
        'ran_del': 'Random Create'
    }

    # Process each log directory
    for log_folder in log_folders:
        fs_type = os.path.basename(log_folder)  # Use directory name as file system type
        for log_file in os.listdir(log_folder):
            file_path = os.path.join(log_folder, log_file)
            get_name = os.path.basename(log_file)
            if os.path.isfile(file_path) and file_path.endswith('.log'):
                data = pd.read_csv(file_path, index_col=None, header=0)
                data['FileSystem'] = fs_type  # Add a new column for file system type
                data['Size'] = get_name
                all_data.append(data)
    
    # Concatenate all dataframes into one
    df = pd.concat(all_data, axis=0, ignore_index=True)

    # Convert latency to consistent units (milliseconds)
    latency_columns = [col + '_latency' for col in test_categories]
    for col in latency_columns:
        if col in df.columns:
            df[col] = df[col].astype(str).apply(convert_latency)

    # Plotting
    throughput_columns = [(key, key + '_latency') for key in test_categories]

    # plot for all sizes
    for throughput, latency in throughput_columns:
        if throughput in df.columns and latency in df.columns:
            fig = px.scatter(df, x=throughput, y=latency, color='FileSystem',
                             title=f'{test_categories[throughput]} : {throughput} -- All Sizes',
                             labels={throughput: f"{throughput} Throughput (K/s)",
                                     latency: f"{latency} Latency (ms)"},
                             hover_data=['FileSystem'])
            fig.update_layout(legend_title_text='File System Type')
            fig.show()

    # spaces
    print('\n\n\n\n===================\n\n\n\n') 

    # same but different plot for each size
    for size in df['Size'].unique():
        if size in ['1.log', '1024.log', '8192.log']:
            for throughput, latency in throughput_columns:
                if throughput in df.columns and latency in df.columns:
                    fig = px.scatter(df[df['Size'] == size], x=throughput, y=latency, color='FileSystem',
                                    title=f'{test_categories[throughput]} : {throughput} -- {size} ',
                                    labels={throughput: f"{throughput} Throughput (K/s)",
                                            latency: f"{latency} Latency (ms)"},
                                    hover_data=['FileSystem'])
                    fig.update_layout(legend_title_text='File System Type')
                    fig.show()

# Benchmark log directories to compare, one per file system.
# Edit this list so it matches where your logs actually live.
log_folders = [
    './ntfs',
    './xfs',
    './shm',
]

# Load every log under those directories and display the plots.
plot_data_from_logs(log_folders)










