# Trace Filter Examples

A trace filter is a callable object which extracts a set of events that match a set of criteria from a Trace Data
Frame. Trace filters are essential to trace analysis as they effectively narrows down the data set to enable more 
efficient and targeted analysis.

A trace filter object operates similarly to a customizable function that is invoked via a consistent interface. 
Users can use or define a Trace Filter class to specify which trace data should be captured or ignored.

The basic use pattern of trace filters is as follows:

```
from hta.common.trace_filter import Filter, IterationIndexFilter, NameFilter

# get df and trace_symbol_table from the parsed tarces. 
df = ...
trace_symbol_table = ...

# extract trace events in the first and scond iterations.
filter_func1 = IterationIndexFilter(iteration_index=[0, 1])
filtered_df1 = filter_func1(df)

# extract trace events whose name starts with "nccl".
filter_func2 = NameFilter(name_pattern=r"^nccl")
filtered_df2 = filter_func2(df, trace_symbol_table)
``` 

In [None]:
import hta
from hta.common.trace import Trace
import os
from pathlib import Path

base_data_dir = str(Path(hta.__file__).parent.parent.joinpath("tests/data"))
trace_dir: str = os.path.join(base_data_dir, "trace_filter")
t = Trace(trace_dir=trace_dir)
t.parse_traces()

In [14]:
df = t.get_trace(0)


In [15]:
df.shape


(243815, 16)

In [17]:
from typing import List
from hta.common.trace import parse_trace_dataframe
import pandas as pd
from hta.common.trace_file import read_trace, write_trace


def extract_trace_events(
    input_trace_file: str, output_trace_file: str, event_indices: List[int]
) -> None:
    """
    Extract the selected events from input_trace_file and write to output_trace_file.

    Args:
        input_trace_file (str): Path to the trace file to be processed.
        output_trace_file (str): Path to the output trace file.
        event_indices: A list of indices corresponding to the events to be extracted.
    """
    raw_trace_data = read_trace(input_trace_file)
    raw_trace_df = pd.DataFrame(raw_trace_data["traceEvents"])
    _, processed_df, _ = parse_trace_dataframe(raw_trace_record=raw_trace_data)

    # Add an 'index' column if not present
    if "index" not in raw_trace_df.columns:
        raw_trace_df.reset_index(inplace=True)
    raw_trace_df["index"] = pd.to_numeric(raw_trace_df["index"], downcast="integer")

    # Separate profiler events and process events
    profiler_events = raw_trace_df.loc[raw_trace_df["cat"].eq("Trace")]
    process_events = raw_trace_df.loc[raw_trace_df["ph"].isin(["M"])]

    # Select the required events
    selected_events = processed_df.loc[processed_df["index"].isin(event_indices)]

    # Combine indices of profiler, process, and selected events
    output_event_index = pd.concat(
        [profiler_events["index"], process_events["index"], selected_events["index"]],
        ignore_index=False,
    ).sort_index()

    output_events = raw_trace_df.loc[output_event_index].copy()

    empty_arg_events = output_events.loc[output_events["args"].isna()]
    output_events.loc[empty_arg_events["index"], "args"] = [
        {"_name": name} for name in empty_arg_events["name"]
    ]

    output_events.drop(columns=["index"], inplace=True)
    raw_trace_data["traceEvents"] = list(
        output_events.apply(lambda row: row.dropna().to_dict(), axis=1)
    )

    write_trace(raw_trace_data, output_trace_file)

In [7]:
t.decode_symbol_ids(use_shorten_name=True)

In [9]:
selected_indices = df.groupby(["iteration", "stream", "cat", "name"]).head(2).index

In [11]:
t.trace_files[0]


'/Users/fengx/Applications/HolisticTraceAnalysis/tests/data/trace_filter/rank-0.json.gz'

In [12]:
extract_trace_events(t.trace_files[0], t.trace_files[0], selected_indices)

TypeError: parse_trace_dataframe() got an unexpected keyword argument 'raw_trace_record'