In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from IPython.display import display, HTML
import plotly.graph_objects as go
import json
import os
import time
from typing import List, Dict, Any, Tuple
from pathlib import Path

In [10]:
logs_path = Path("logs")

tensor([1, 2, 3])

# Useful functions

In [None]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df[f"time_elapsed.{event_name}"] = (df[end_column] - df[start_column]).dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    for columns in df.columns:
        if columns.startswith("start") or columns.startswith("end"):
            df[columns] = pd.to_datetime(df[columns], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    start_end_pairs: List[Tuple[str, str]] = []
    for column in df.columns:
        if column.startswith("start"):
            start_column = column
            end_column = column.replace("start", "end")
            start_end_pairs.append((start_column, end_column))
    return start_end_pairs

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: Path) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []
    for column_pair in columns_pairs:
        start_column, end_column = column_pair
        event_name = start_column.replace("start.", "")
        df_event = df[[start_column, end_column]]
        df_event.columns = ["start", "end"]
        df_event.dropna(inplace=True)
        df_event["event"] = event_name
        # substract the start time to the first event (datetime object) to get the time elapsed
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name.name
        df_event["legend"] = f"{event_name} ({name})"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    for column in df.columns:
        if column.startswith("start") or column.startswith("end"):
            df[column] = df[column] + pd.Timedelta(seconds=diff_time)
    return df

# Distributed

In [4]:
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.json"))
workers_files.sort()

workers_events_dfs: List[pd.DataFrame] = []
workers_dfs: List[pd.DataFrame] = []
for log in workers_files:
    dataset = str(log).split(".")[-4]
    worker = str(log).split(".")[-2]
    world_size = str(log).split(".")[-6]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["worker"] = worker
        df["log"] = log
        df["world_size"] = world_size
        df = convert_all_pairs_to_datetime(df)
        workers_dfs.append(df)
        workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
workers_df = pd.concat(workers_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_df)
print(workers_events_pairs)
workers_df = compute_time_elapsed(workers_df, workers_events_pairs)
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_df)

[('start.epoch', 'end.epoch'), ('start.calc_gradients', 'end.calc_gradients'), ('start.recv_data', 'end.recv_data'), ('start.send', 'end.send'), ('start.swap_recv_instruction', 'end.swap_recv_instruction'), ('start.load_state_dict', 'end.load_state_dict'), ('start.swap_recv', 'end.swap_recv'), ('start.swap_send', 'end.swap_send')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-18 12:49:58.573754787,2024-05-18 12:49:59.203000069,epoch,0.629245,mdgan.4.CIFAR10.worker.1.logs.json,epoch (logs/mdgan.4.CIFAR10.worker.1.logs.json),0
1,2024-05-18 12:49:59.203452826,2024-05-18 12:50:02.506414174,epoch,3.302961,mdgan.4.CIFAR10.worker.1.logs.json,epoch (logs/mdgan.4.CIFAR10.worker.1.logs.json),1
2,2024-05-18 12:50:02.506812811,2024-05-18 12:50:02.579381943,epoch,0.072569,mdgan.4.CIFAR10.worker.1.logs.json,epoch (logs/mdgan.4.CIFAR10.worker.1.logs.json),2
3,2024-05-18 12:50:02.579784870,2024-05-18 12:50:02.640255928,epoch,0.060471,mdgan.4.CIFAR10.worker.1.logs.json,epoch (logs/mdgan.4.CIFAR10.worker.1.logs.json),3
4,2024-05-18 12:50:02.640681028,2024-05-18 12:50:02.696606874,epoch,0.055926,mdgan.4.CIFAR10.worker.1.logs.json,epoch (logs/mdgan.4.CIFAR10.worker.1.logs.json),4
...,...,...,...,...,...,...,...
143,2024-05-18 12:50:17.730485916,2024-05-18 12:50:17.739791870,send,0.009306,mdgan.4.CIFAR10.worker.4.logs.json,send (logs/mdgan.4.CIFAR10.worker.4.logs.json),143
100,2024-05-18 12:50:12.061578035,2024-05-18 12:50:12.074746132,swap_recv_instruction,0.013168,mdgan.4.CIFAR10.worker.4.logs.json,swap_recv_instruction (logs/mdgan.4.CIFAR10.wo...,100
100,2024-05-18 12:50:12.091194868,2024-05-18 12:50:12.102967024,load_state_dict,0.011772,mdgan.4.CIFAR10.worker.4.logs.json,load_state_dict (logs/mdgan.4.CIFAR10.worker.4...,100
100,2024-05-18 12:50:12.088737011,2024-05-18 12:50:12.091130018,swap_recv,0.002393,mdgan.4.CIFAR10.worker.4.logs.json,swap_recv (logs/mdgan.4.CIFAR10.worker.4.logs....,100


Unnamed: 0,epoch,start.epoch,end.epoch,start.calc_gradients,end.calc_gradients,start.recv_data,end.recv_data,start.send,end.send,start.swap_recv_instruction,...,log,world_size,time_elapsed.epoch,time_elapsed.calc_gradients,time_elapsed.recv_data,time_elapsed.send,time_elapsed.swap_recv_instruction,time_elapsed.load_state_dict,time_elapsed.swap_recv,time_elapsed.swap_send
0,0,2024-05-18 12:49:58.573754787,2024-05-18 12:49:59.203000069,2024-05-18 12:49:58.701009989,2024-05-18 12:49:59.189142942,2024-05-18 12:49:58.576512098,2024-05-18 12:49:58.701000929,2024-05-18 12:49:59.189144135,2024-05-18 12:49:59.202999115,NaT,...,logs/mdgan.4.CIFAR10.worker.1.logs.json,4,0.629245,0.488133,0.124489,0.013855,,,,
1,1,2024-05-18 12:49:59.203452826,2024-05-18 12:50:02.506414174,2024-05-18 12:50:02.456840038,2024-05-18 12:50:02.497569084,2024-05-18 12:49:59.205319881,2024-05-18 12:50:02.456837177,2024-05-18 12:50:02.497569084,2024-05-18 12:50:02.506412029,NaT,...,logs/mdgan.4.CIFAR10.worker.1.logs.json,4,3.302961,0.040729,3.251517,0.008843,,,,
2,2,2024-05-18 12:50:02.506812811,2024-05-18 12:50:02.579381943,2024-05-18 12:50:02.547446012,2024-05-18 12:50:02.570080996,2024-05-18 12:50:02.508537054,2024-05-18 12:50:02.547442913,2024-05-18 12:50:02.570081949,2024-05-18 12:50:02.579381943,NaT,...,logs/mdgan.4.CIFAR10.worker.1.logs.json,4,0.072569,0.022635,0.038906,0.009300,,,,
3,3,2024-05-18 12:50:02.579784870,2024-05-18 12:50:02.640255928,2024-05-18 12:50:02.612402916,2024-05-18 12:50:02.632281065,2024-05-18 12:50:02.581574917,2024-05-18 12:50:02.612400055,2024-05-18 12:50:02.632281065,2024-05-18 12:50:02.640254974,NaT,...,logs/mdgan.4.CIFAR10.worker.1.logs.json,4,0.060471,0.019878,0.030825,0.007974,,,,
4,4,2024-05-18 12:50:02.640681028,2024-05-18 12:50:02.696606874,2024-05-18 12:50:02.671496868,2024-05-18 12:50:02.690617085,2024-05-18 12:50:02.642319918,2024-05-18 12:50:02.671494007,2024-05-18 12:50:02.690617085,2024-05-18 12:50:02.696604967,NaT,...,logs/mdgan.4.CIFAR10.worker.1.logs.json,4,0.055926,0.019120,0.029174,0.005988,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,139,2024-05-18 12:50:17.437331915,2024-05-18 12:50:17.489758968,2024-05-18 12:50:17.467630148,2024-05-18 12:50:17.485482931,2024-05-18 12:50:17.439098835,2024-05-18 12:50:17.467628002,2024-05-18 12:50:17.485484123,2024-05-18 12:50:17.489758015,NaT,...,logs/mdgan.4.CIFAR10.worker.4.logs.json,4,0.052427,0.017853,0.028529,0.004274,,,,
140,140,2024-05-18 12:50:17.492928982,2024-05-18 12:50:17.555788994,2024-05-18 12:50:17.527674198,2024-05-18 12:50:17.546792030,2024-05-18 12:50:17.494642019,2024-05-18 12:50:17.527671099,2024-05-18 12:50:17.546792984,2024-05-18 12:50:17.555788040,NaT,...,logs/mdgan.4.CIFAR10.worker.4.logs.json,4,0.062860,0.019118,0.033029,0.008995,,,,
141,141,2024-05-18 12:50:17.559750795,2024-05-18 12:50:17.613753080,2024-05-18 12:50:17.588660002,2024-05-18 12:50:17.606897831,2024-05-18 12:50:17.561520815,2024-05-18 12:50:17.588656902,2024-05-18 12:50:17.606899023,2024-05-18 12:50:17.613752127,NaT,...,logs/mdgan.4.CIFAR10.worker.4.logs.json,4,0.054002,0.018238,0.027136,0.006853,,,,
142,142,2024-05-18 12:50:17.616926908,2024-05-18 12:50:17.677464008,2024-05-18 12:50:17.650402069,2024-05-18 12:50:17.669045925,2024-05-18 12:50:17.618542194,2024-05-18 12:50:17.650398970,2024-05-18 12:50:17.669045925,2024-05-18 12:50:17.677462101,NaT,...,logs/mdgan.4.CIFAR10.worker.4.logs.json,4,0.060537,0.018644,0.031857,0.008416,,,,


In [5]:
server_files = list(logs_path.glob("*.*.*.server.logs.json"))
server_files.sort()

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    dataset = str(log).split(".")[-3]
    world_size = str(log).split(".")[-5]
    with open(log) as f:
        df = pd.read_json(f)
        df["dataset"] = dataset
        df["world_size"] = world_size
        df["log"] = log
        df = convert_all_pairs_to_datetime(df)
        server_dfs.append(df)
        server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.send_data', 'end.send_data'), ('start.recv_data', 'end.recv_data'), ('start.calc_gradients', 'end.calc_gradients'), ('start.apply_gradients', 'end.apply_gradients'), ('start.generate_data', 'end.generate_data'), ('start.fid', 'end.fid'), ('start.is', 'end.is'), ('start.swap', 'end.swap')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-18 12:49:58.573624134,2024-05-18 12:50:02.425014019,epoch,3.851390,mdgan.4.CIFAR10.server.logs.json,epoch (logs/mdgan.4.CIFAR10.server.logs.json),0
1,2024-05-18 12:50:02.425466776,2024-05-18 12:50:02.534054995,epoch,0.108588,mdgan.4.CIFAR10.server.logs.json,epoch (logs/mdgan.4.CIFAR10.server.logs.json),1
2,2024-05-18 12:50:02.534476042,2024-05-18 12:50:02.593872786,epoch,0.059397,mdgan.4.CIFAR10.server.logs.json,epoch (logs/mdgan.4.CIFAR10.server.logs.json),2
3,2024-05-18 12:50:02.594260931,2024-05-18 12:50:02.653196096,epoch,0.058935,mdgan.4.CIFAR10.server.logs.json,epoch (logs/mdgan.4.CIFAR10.server.logs.json),3
4,2024-05-18 12:50:02.653635025,2024-05-18 12:50:02.711713076,epoch,0.058078,mdgan.4.CIFAR10.server.logs.json,epoch (logs/mdgan.4.CIFAR10.server.logs.json),4
...,...,...,...,...,...,...,...
100,2024-05-18 12:50:12.509105920,2024-05-18 12:50:15.019316912,fid,2.510211,mdgan.4.CIFAR10.server.logs.json,fid (logs/mdgan.4.CIFAR10.server.logs.json),100
0,2024-05-18 12:49:59.509463071,2024-05-18 12:49:59.939698935,is,0.430236,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),0
50,2024-05-18 12:50:05.665454149,2024-05-18 12:50:06.226917028,is,0.561463,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),50
100,2024-05-18 12:50:12.093562841,2024-05-18 12:50:12.509104012,is,0.415541,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),100


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,start.send_data,end.send_data,start.recv_data,end.recv_data,start.calc_gradients,...,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.send_data,time_elapsed.recv_data,time_elapsed.calc_gradients,time_elapsed.apply_gradients,time_elapsed.generate_data,time_elapsed.fid,time_elapsed.is,time_elapsed.swap
0,0,2024-05-18 12:49:58.573624134,2024-05-18 12:50:02.425014019,2024-05-18 12:49:58.573624134,2024-05-18 12:49:59.504447937,2024-05-18 12:49:58.685534000,2024-05-18 12:49:58.701365948,2024-05-18 12:49:58.701389074,2024-05-18 12:49:59.231646061,2024-05-18 12:49:59.232132196,...,3.851390,0.930824,0.015832,0.530257,0.144914,0.127286,0.111905,2.459337,0.430236,
1,1,2024-05-18 12:50:02.425466776,2024-05-18 12:50:02.534054995,2024-05-18 12:50:02.425466776,2024-05-18 12:50:02.534054995,2024-05-18 12:50:02.429689169,2024-05-18 12:50:02.457438946,2024-05-18 12:50:02.457514048,2024-05-18 12:50:02.509613037,2024-05-18 12:50:02.509999037,...,0.108588,0.108588,0.027750,0.052099,0.011614,0.012332,0.004215,,,
2,2,2024-05-18 12:50:02.534476042,2024-05-18 12:50:02.593872786,2024-05-18 12:50:02.534476042,2024-05-18 12:50:02.593872786,2024-05-18 12:50:02.536020041,2024-05-18 12:50:02.548504114,2024-05-18 12:50:02.548521996,2024-05-18 12:50:02.579642773,2024-05-18 12:50:02.580418110,...,0.059397,0.059397,0.012484,0.031121,0.010426,0.002920,0.001542,,,
3,3,2024-05-18 12:50:02.594260931,2024-05-18 12:50:02.653196096,2024-05-18 12:50:02.594260931,2024-05-18 12:50:02.653196096,2024-05-18 12:50:02.596010923,2024-05-18 12:50:02.614262104,2024-05-18 12:50:02.614284992,2024-05-18 12:50:02.641333103,2024-05-18 12:50:02.641757965,...,0.058935,0.058935,0.018251,0.027048,0.008543,0.002806,0.001747,,,
4,4,2024-05-18 12:50:02.653635025,2024-05-18 12:50:02.711713076,2024-05-18 12:50:02.653635025,2024-05-18 12:50:02.711713076,2024-05-18 12:50:02.655022144,2024-05-18 12:50:02.672260046,2024-05-18 12:50:02.672283888,2024-05-18 12:50:02.699651003,2024-05-18 12:50:02.700057030,...,0.058078,0.058078,0.017238,0.027367,0.008843,0.002717,0.001382,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,138,2024-05-18 12:50:17.390932083,2024-05-18 12:50:17.448736906,2024-05-18 12:50:17.390932083,2024-05-18 12:50:17.448735952,2024-05-18 12:50:17.392491102,2024-05-18 12:50:17.406517029,2024-05-18 12:50:17.406732082,2024-05-18 12:50:17.434292078,2024-05-18 12:50:17.434828043,...,0.057805,0.057804,0.014026,0.027560,0.010811,0.002999,0.001554,,,
139,139,2024-05-18 12:50:17.453231096,2024-05-18 12:50:17.507412910,2024-05-18 12:50:17.453231096,2024-05-18 12:50:17.507412910,2024-05-18 12:50:17.454609871,2024-05-18 12:50:17.467134953,2024-05-18 12:50:17.467163801,2024-05-18 12:50:17.494704962,2024-05-18 12:50:17.495109081,...,0.054182,0.054182,0.012525,0.027541,0.009262,0.002951,0.001377,,,
140,140,2024-05-18 12:50:17.511292934,2024-05-18 12:50:17.569319010,2024-05-18 12:50:17.511292934,2024-05-18 12:50:17.569319010,2024-05-18 12:50:17.512588024,2024-05-18 12:50:17.527181149,2024-05-18 12:50:17.527240037,2024-05-18 12:50:17.555945873,2024-05-18 12:50:17.556730032,...,0.058026,0.058026,0.014593,0.028706,0.009483,0.003007,0.001294,,,
141,141,2024-05-18 12:50:17.573210955,2024-05-18 12:50:17.629283190,2024-05-18 12:50:17.573210955,2024-05-18 12:50:17.629283190,2024-05-18 12:50:17.574649096,2024-05-18 12:50:17.588155985,2024-05-18 12:50:17.588188887,2024-05-18 12:50:17.616772175,2024-05-18 12:50:17.617376089,...,0.056072,0.056072,0.013507,0.028583,0.009141,0.002664,0.001437,,,


In [6]:
data_size = server_df["size.data"].iloc[0]
feedback_size = server_df["size.feedback"].iloc[0]
model_size = workers_df["size.model"].iloc[0]

print(f"Data size: {data_size:.2f}MB")
print(f"Feedback size: {feedback_size:.2f}MB")
print(f"Model size: {model_size:.2f}MB")

Data size: 4.69MB
Feedback size: 0.47MB
Model size: 2.53MB


In [7]:
logs_standalones = list(logs_path.glob("*.standalone.logs.json"))

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        data = json.load(f)
        df = pd.DataFrame(data)
        df["dataset"] = dataset
        df["log"] = log.name
        df = convert_all_pairs_to_datetime(df)
        
        corresponding_server = server_df[(server_df["dataset"] == dataset)]
        start_time_server: pd.Timedelta = server_df["start.epoch"].min()
        start_time_standalone: pd.Timedelta = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        standalone_df = align_start_times(diff_time.total_seconds(), df)

        standalone_dfs.append(df)
        standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), log))
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

diff_time: -1 days +23:57:18.736813068
[('start.epoch', 'end.epoch'), ('start.epoch_calculation', 'end.epoch_calculation'), ('start.train', 'end.train'), ('start.fid', 'end.fid'), ('start.is', 'end.is')]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["event"] = event_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.is,fid,is,dataset,log,time_elapsed.epoch,time_elapsed.epoch_calculation,time_elapsed.train,time_elapsed.fid,time_elapsed.is
0,0,2024-05-18 12:49:58.573624066,2024-05-18 12:50:02.640156201,2024-05-18 12:49:58.573624066,2024-05-18 12:49:58.973466090,0,1.472082,2.691529,2024-05-18 12:49:58.973466090,NaT,...,2024-05-18 12:50:02.640106848,427.379852,1.097491,CIFAR10,CIFAR10.standalone.logs.json,4.066532,0.399842,,3.142883,0.507018
1,1,2024-05-18 12:50:02.673102072,2024-05-18 12:50:02.717674902,2024-05-18 12:50:02.673103026,2024-05-18 12:50:02.717636040,1,1.784279,2.786648,2024-05-18 12:50:02.717636040,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.044573,0.044533,,,
2,2,2024-05-18 12:50:02.745737007,2024-05-18 12:50:02.775465897,2024-05-18 12:50:02.745737961,2024-05-18 12:50:02.775424889,2,1.427946,2.999816,2024-05-18 12:50:02.775424889,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.029729,0.029687,,,
3,3,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839507988,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839470080,3,1.610729,3.494141,2024-05-18 12:50:02.839470080,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.033559,0.033521,,,
4,4,2024-05-18 12:50:02.875808886,2024-05-18 12:50:02.916915825,2024-05-18 12:50:02.875810078,2024-05-18 12:50:02.916880062,4,1.541870,3.350138,2024-05-18 12:50:02.916880062,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.041107,0.041070,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,118,2024-05-18 12:50:17.463212899,2024-05-18 12:50:17.500127962,2024-05-18 12:50:17.463212899,2024-05-18 12:50:17.500092200,118,0.336449,7.620221,2024-05-18 12:50:17.500092200,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.036915,0.036879,,,
119,119,2024-05-18 12:50:17.527247122,2024-05-18 12:50:17.568334034,2024-05-18 12:50:17.527247122,2024-05-18 12:50:17.568292073,119,0.079244,4.573243,2024-05-18 12:50:17.568292073,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.041087,0.041045,,,
120,120,2024-05-18 12:50:17.597579888,2024-05-18 12:50:17.637738875,2024-05-18 12:50:17.597579888,2024-05-18 12:50:17.637700012,120,0.188583,5.797953,2024-05-18 12:50:17.637700012,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.040159,0.040120,,,
121,121,2024-05-18 12:50:17.666503838,2024-05-18 12:50:17.695207766,2024-05-18 12:50:17.666504792,2024-05-18 12:50:17.695171049,121,0.115188,4.437811,2024-05-18 12:50:17.695171049,NaT,...,NaT,,,CIFAR10,CIFAR10.standalone.logs.json,0.028704,0.028666,,,


Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-18 12:49:58.573624066,2024-05-18 12:50:02.640156201,epoch,4.066532,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),0
1,2024-05-18 12:50:02.673102072,2024-05-18 12:50:02.717674902,epoch,0.044573,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),1
2,2024-05-18 12:50:02.745737007,2024-05-18 12:50:02.775465897,epoch,0.029729,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),2
3,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839507988,epoch,0.033559,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),3
4,2024-05-18 12:50:02.875808886,2024-05-18 12:50:02.916915825,epoch,0.041107,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),4
...,...,...,...,...,...,...,...
50,2024-05-18 12:50:05.905620983,2024-05-18 12:50:08.810707978,fid,2.905087,CIFAR10.standalone.logs.json,fid (logs/CIFAR10.standalone.logs.json),50
100,2024-05-18 12:50:12.551649979,2024-05-18 12:50:15.791346958,fid,3.239697,CIFAR10.standalone.logs.json,fid (logs/CIFAR10.standalone.logs.json),100
0,2024-05-18 12:50:02.133088997,2024-05-18 12:50:02.640106848,is,0.507018,CIFAR10.standalone.logs.json,is (logs/CIFAR10.standalone.logs.json),0
50,2024-05-18 12:50:08.810708931,2024-05-18 12:50:09.358181170,is,0.547472,CIFAR10.standalone.logs.json,is (logs/CIFAR10.standalone.logs.json),50


In [8]:
all_events_df = pd.concat([standalone_events_df, workers_events_df, server_events_df])
all_df = pd.concat([standalone_df, workers_df, server_df])
display(all_events_df)
display(all_df)

Unnamed: 0,start,end,event,time_elapsed,name,legend,index
0,2024-05-18 12:49:58.573624066,2024-05-18 12:50:02.640156201,epoch,4.066532,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),0
1,2024-05-18 12:50:02.673102072,2024-05-18 12:50:02.717674902,epoch,0.044573,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),1
2,2024-05-18 12:50:02.745737007,2024-05-18 12:50:02.775465897,epoch,0.029729,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),2
3,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839507988,epoch,0.033559,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),3
4,2024-05-18 12:50:02.875808886,2024-05-18 12:50:02.916915825,epoch,0.041107,CIFAR10.standalone.logs.json,epoch (logs/CIFAR10.standalone.logs.json),4
...,...,...,...,...,...,...,...
100,2024-05-18 12:50:12.509105920,2024-05-18 12:50:15.019316912,fid,2.510211,mdgan.4.CIFAR10.server.logs.json,fid (logs/mdgan.4.CIFAR10.server.logs.json),100
0,2024-05-18 12:49:59.509463071,2024-05-18 12:49:59.939698935,is,0.430236,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),0
50,2024-05-18 12:50:05.665454149,2024-05-18 12:50:06.226917028,is,0.561463,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),50
100,2024-05-18 12:50:12.093562841,2024-05-18 12:50:12.509104012,is,0.415541,mdgan.4.CIFAR10.server.logs.json,is (logs/mdgan.4.CIFAR10.server.logs.json),100


Unnamed: 0,epoch,start.epoch,end.epoch,start.epoch_calculation,end.epoch_calculation,absolut_step,mean_d_loss,mean_g_loss,start.train,end.train,...,end.generate_data,size.data,size.feedback,start.swap,end.swap,swap,time_elapsed.send_data,time_elapsed.apply_gradients,time_elapsed.generate_data,time_elapsed.swap
0,0,2024-05-18 12:49:58.573624066,2024-05-18 12:50:02.640156201,2024-05-18 12:49:58.573624066,2024-05-18 12:49:58.973466090,0.0,1.472082,2.691529,2024-05-18 12:49:58.973466090,NaT,...,NaT,,,NaT,NaT,,,,,
1,1,2024-05-18 12:50:02.673102072,2024-05-18 12:50:02.717674902,2024-05-18 12:50:02.673103026,2024-05-18 12:50:02.717636040,1.0,1.784279,2.786648,2024-05-18 12:50:02.717636040,NaT,...,NaT,,,NaT,NaT,,,,,
2,2,2024-05-18 12:50:02.745737007,2024-05-18 12:50:02.775465897,2024-05-18 12:50:02.745737961,2024-05-18 12:50:02.775424889,2.0,1.427946,2.999816,2024-05-18 12:50:02.775424889,NaT,...,NaT,,,NaT,NaT,,,,,
3,3,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839507988,2024-05-18 12:50:02.805948904,2024-05-18 12:50:02.839470080,3.0,1.610729,3.494141,2024-05-18 12:50:02.839470080,NaT,...,NaT,,,NaT,NaT,,,,,
4,4,2024-05-18 12:50:02.875808886,2024-05-18 12:50:02.916915825,2024-05-18 12:50:02.875810078,2024-05-18 12:50:02.916880062,4.0,1.541870,3.350138,2024-05-18 12:50:02.916880062,NaT,...,NaT,,,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,138,2024-05-18 12:50:17.390932083,2024-05-18 12:50:17.448736906,2024-05-18 12:50:17.390932083,2024-05-18 12:50:17.448735952,,,,NaT,NaT,...,2024-05-18 12:50:17.392486811,4.6875,0.46875,NaT,NaT,False,0.014026,0.002999,0.001554,
139,139,2024-05-18 12:50:17.453231096,2024-05-18 12:50:17.507412910,2024-05-18 12:50:17.453231096,2024-05-18 12:50:17.507412910,,,,NaT,NaT,...,2024-05-18 12:50:17.454609871,4.6875,0.46875,NaT,NaT,False,0.012525,0.002951,0.001377,
140,140,2024-05-18 12:50:17.511292934,2024-05-18 12:50:17.569319010,2024-05-18 12:50:17.511292934,2024-05-18 12:50:17.569319010,,,,NaT,NaT,...,2024-05-18 12:50:17.512588024,4.6875,0.46875,NaT,NaT,False,0.014593,0.003007,0.001294,
141,141,2024-05-18 12:50:17.573210955,2024-05-18 12:50:17.629283190,2024-05-18 12:50:17.573210955,2024-05-18 12:50:17.629283190,,,,NaT,NaT,...,2024-05-18 12:50:17.574649096,4.6875,0.46875,NaT,NaT,False,0.013507,0.002664,0.001437,


In [16]:
all_df_size = all_df[["log", "epoch", "size.sent", "size.recv"]]
all_df_size = all_df_size.dropna()
px.line(all_df_size, x="epoch", y=["size.sent", "size.recv"], color="log", title="Size sent and received per epoch")

In [20]:
all_df_size.groupby("log").mean()[["size.sent", "size.recv"]]

Unnamed: 0_level_0,size.sent,size.recv
log,Unnamed: 1_level_1,Unnamed: 2_level_1
logs/mdgan.4.CIFAR10.server.logs.json,0.9375,0.46875
logs/mdgan.4.CIFAR10.worker.1.logs.json,0.134779,0.251967
logs/mdgan.4.CIFAR10.worker.2.logs.json,0.134779,0.251967
logs/mdgan.4.CIFAR10.worker.3.logs.json,0.134779,0.251967
logs/mdgan.4.CIFAR10.worker.4.logs.json,0.134779,0.251967


In [10]:
px.line(standalone_df, x="epoch", y=["mean_d_loss", "mean_g_loss"], title="Losses standalone", template="plotly_white").show()
px.line(all_df, x="epoch", y=["mean_d_loss"], color="log", title="Losses discriminators", template="plotly_white").show()
px.line(all_df[["epoch", "log", "fid"]].dropna(), x="epoch", y=["fid"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df[["epoch", "log", "is"]].dropna(), x="epoch", y=["is"], color="log", title="Losses", template="plotly_white").show()
px.line(all_df, x="epoch", y=["time_elapsed.epoch_calculation"], color="log", title="Epoch duration", template="plotly_white").show()

In [11]:
mean_time_elapsed = server_events_df[["legend", "time_elapsed"]].groupby("legend").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="legend", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="legend", title="Mean time elapsed", template="plotly_white").show()

In [12]:
mean_time_elapsed = workers_events_df[["event", "time_elapsed"]].groupby("event").mean().sort_values(by="time_elapsed").reset_index()
px.bar(mean_time_elapsed, y="time_elapsed", title="Mean time elapsed", color="event", template="plotly_white", text_auto=True).show()
px.pie(mean_time_elapsed, values="time_elapsed", names="event", title="Mean time elapsed", template="plotly_white").show()

In [13]:
timeline = px.timeline(
    all_events_df,
    x_start="start",
    x_end="end",
    color="name",
    y="event",
    opacity=0.5,
    template="plotly_white",
)

timeline.show()