# Import libraries

In [None]:
import os
import pandas as pd
from hta.common.trace import Trace
from hta.configs.config import logger
from hta.analyzers.timeline import plot_timeline_gpu_kernels

from pathlib import Path

## Display figures on GitHub

+ The following cell makes the figures visible on GitHub, but it also makes them non-interactive.

+ To enable interactive viewing of the figures, set `ON_GITHUB = False` when running this notebook outside GitHub.

+ Run all cells to generate all the figures.

In [None]:
# Rendering switch: True selects plotly's static "svg" renderer so the figures
# show up on GitHub (non-interactive); False keeps plotly's default renderer.
ON_GITHUB = False
if ON_GITHUB:
    import plotly.io as pio

    pio.renderers.default = "svg"

# Load Trace Data

In [None]:
%%time

# Detect the TraceAnalyzer project path
project_path_candidates=!find ~/ -name TraceAnalyzer
for p in project_path_candidates:
    if os.path.exists(p) and os.path.exists(Path(p).joinpath("hta")):
        base_dir = p
        break
        
# Specify a trace folder
# Ensure the sample datasets exists
assert base_dir is not None
assert os.path.exists(f"{base_dir}/tests/data/vision_transformer")

trace_dir = str(Path(base_dir).joinpath("tests/data/vision_transformer"))
logger.info(f"Using traces from folder {trace_dir}")

# Parse the traces
trace_data = Trace(trace_dir=trace_dir)
trace_data.parse_traces()

# Prepare a DataFrame that contains all the traces

In [None]:
%%time
_ranks = list(trace_data.get_all_traces().keys())
df = pd.concat([trace_data.get_trace(r) for r in _ranks], axis=0, keys=_ranks, names=["rank", "idx"]).reset_index()
symbol_table = trace_data.symbol_table

# Example #1 - Plot GPU kernels on all ranks for one iteration

In [None]:
%%time
plot_timeline_gpu_kernels("Timeline of GPU Kernels (Iteration=15)", df, symbol_table, iterations=[15], duration_threshold=2000)

# Example #2 - Plot compute kernels for two iterations

In [None]:
%%time
s_map = pd.Series(symbol_table.get_sym_id_map())
non_computer_name_ids = s_map[
    s_map.index.str.startswith("ncclKernel")
    | s_map.index.str.startswith("Memset")
    | s_map.index.str.startswith("Memcpy")
].values
compute_df = df.loc[~df["name"].isin(non_computer_name_ids)]
plot_timeline_gpu_kernels("Timeline of Compute Kernels (Iteration = [15, 16])", compute_df, symbol_table, iterations=[15, 16], duration_threshold=2000)

# Example #3 - Plot compute kernels on one stream for one iteration

In [None]:
# Series built from the symbol table's name/id map: names are the index
# (filtered below by prefix) and the values are the ids matched against
# df["name"].
s_map = pd.Series(symbol_table.get_sym_id_map())

# Ids of every symbol whose name marks it as a non-compute kernel
# (NCCL communication, memset, or memcpy).
non_compute_name_ids = s_map[
    s_map.index.str.startswith("ncclKernel")
    | s_map.index.str.startswith("Memset")
    | s_map.index.str.startswith("Memcpy")
].values

# Keep only the rows whose name id is NOT in the non-compute set, then plot
# iteration 16 on stream 7.
compute_df = df.loc[~df["name"].isin(non_compute_name_ids)]
plot_timeline_gpu_kernels(
    "Timeline of Compute Kernels (Iteration=16, Stream=7)",
    compute_df,
    symbol_table,
    iterations=[16],
    streams=[7],
    duration_threshold=2000,
)

# Example #4 - Plot all communication kernels on all iterations

In [None]:
%%time
s_map = pd.Series(symbol_table.get_sym_id_map())
communicate_name_ids = s_map[
    s_map.index.str.startswith("ncclKernel")
].values
communicate_df = df.loc[df["name"].isin(communicate_name_ids)]
plot_timeline_gpu_kernels("Timeline of All Communication Kernels", communicate_df, symbol_table, duration_threshold=2000)

# Example #5 - Plot all-to-all communication kernels on ranks 0, 1, 2

In [None]:
%%time
s_map = pd.Series(symbol_table.get_sym_id_map())
communicate_name_ids = s_map[
    s_map.index.str.startswith("ncclKernel_ReduceScatter")
].values
communicate_df = df.loc[df["name"].isin(communicate_name_ids)]
plot_timeline_gpu_kernels("All-to-All Communication Kernels on Ranks [0, 1, 2]", communicate_df, symbol_table, ranks=[0, 1, 2], duration_threshold=2000)