In [None]:
import pandas as pd

In [None]:
# Load the profiler CSV export into a DataFrame. The path is relative, so it is
# resolved against the notebook's working directory.
# NOTE(review): presumably an NVIDIA Nsight Compute export, judging by the
# ".ncu-rep" file name and the metric column names used later - confirm.
df = pd.read_csv("mobnet_bottleneck_layer_bs_32_epoch_20.ncu-rep.csv")

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Column labels of the export; later cells address metric columns by these exact names.
df.columns

In [None]:
# Distinct values of the 'Demangled Name' column, in order of first appearance.
df['Demangled Name'].unique().tolist()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how often each 'Demangled Name' occurs, drawn on an explicit
# figure/axes pair with a log-scaled count axis.
fig, ax = plt.subplots(figsize=(10, 10))
df['Demangled Name'].value_counts().plot(kind='bar', ax=ax)
ax.set_xlabel('Kernel Names')
ax.set_ylabel('Counts')
ax.set_yscale('log', base=10)
ax.set_title('Histogram of Kernel Names')

# Write each bar's height just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() * 1.005, bar_height * 1.05))

plt.show()

In [None]:
# Map every kernel name to the list of row labels (df.index values) at which it occurs.
occurrences = {}

# A single grouped pass replaces re-filtering the whole frame once per kernel.
# sort=False keeps the kernels in order of first appearance.
# Rows whose 'Demangled Name' is missing (NaN) are skipped by groupby.
for kernel, kernel_rows in df.groupby('Demangled Name', sort=False):
    indices = kernel_rows.index.tolist()
    occurrences[kernel] = indices
    print(f"{kernel}: {indices}")

In [None]:
# Kernel names for index labels up to and including 15 (.loc slices are end-inclusive).
df.loc[:15, 'Demangled Name']

In [None]:
# Kernel names for index labels 16 through 31, both ends included.
df.loc[16:31, 'Demangled Name']

In [None]:
# Count how many rows carry each 'Demangled Name'
value_counts = df['Demangled Name'].value_counts()

# Keep only the names that occur exactly 20 times
# NOTE(review): 20 presumably matches the "epoch_20" in the input file name - confirm.
filtered_values = value_counts[value_counts == 20].index

# Display the matching names together with how many of them there are
filtered_values, len(filtered_values)

In [None]:
# Kernel names ordered from most to least frequent (value_counts sorts by count, descending).
value_counts.index

In [None]:
# Iterate over each value in filtered_values
# occurrences = {}
# for value in filtered_values:
#     # Get the indices of occurrences in the original dataframe
#     indices = df[df['Demangled Name'] == value].index
#     occurrences[value] = indices.tolist()
#     print(f"Indices for {value}: {indices.tolist()}")

# occurrences = {}
# for value in value_counts.index:
#     indices = df[df['Demangled Name'] == value].index
#     occurrences[value] = indices.tolist()
#     print(f"Indices for {value}: {indices.tolist()}")

In [None]:
# Kernel name stored at index label 0.
df.loc[0, 'Demangled Name']

In [None]:
# Find the column whose label contains "gpu__time_duration.sum"; the FLOP and
# memory estimates below multiply by this column.
duration_columns = [name for name in df.columns if "gpu__time_duration.sum" in name]

# Fail here with a clear message instead of leaving col == "" and letting later
# cells die with an opaque KeyError.
if not duration_columns:
    raise KeyError("no column containing 'gpu__time_duration.sum' found in df.columns")

# If several columns match, the last one wins.
col = duration_columns[-1]
col

In [None]:
def _ncu_get_flops_double(kernel_data: dict, col: str) -> float:
    flops = (kernel_data['smsp__sass_thread_inst_executed_op_dadd_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['smsp__sass_thread_inst_executed_op_dmul_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['derived__smsp__sass_thread_inst_executed_op_dfma_pred_on_x2 [inst]']) \
            * kernel_data['smsp__cycles_elapsed.avg.per_second [Ghz]'] \
            * kernel_data[col] 

    return flops


def _ncu_get_flops_single(kernel_data: dict, col: str, breakdown=None) -> float:
    flops = (kernel_data['smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['derived__smsp__sass_thread_inst_executed_op_ffma_pred_on_x2 [inst]']) \
            * kernel_data['smsp__cycles_elapsed.avg.per_second [Ghz]'] \
            * kernel_data[col]
    
    if breakdown:
        return flops, kernel_data['smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed [inst/cycle]'], kernel_data['smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed [inst/cycle]'], kernel_data['derived__smsp__sass_thread_inst_executed_op_ffma_pred_on_x2 [inst]']
    
    return flops


def _ncu_get_flops_half(kernel_data: dict, col: str) -> float:
    flops = (kernel_data['smsp__sass_thread_inst_executed_op_hadd_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['smsp__sass_thread_inst_executed_op_hmul_pred_on.sum.per_cycle_elapsed [inst/cycle]'] \
                + kernel_data['derived__smsp__sass_thread_inst_executed_op_hfma_pred_on_x2 [inst]']) \
            * kernel_data['smsp__cycles_elapsed.avg.per_second [Ghz]'] \
            * kernel_data[col]

    return flops


def _have_strings(name: str, *strings):
    return any(s in name for s in strings)


def _ncu_get_flops_tensor(kernel_data: dict, col: str, breakdown=None) -> float:
    """Estimate the tensor-pipe FLOP count of one profiled kernel.

    The tensor-pipe instruction metric is multiplied by a per-instruction
    factor chosen from substrings of ``kernel_data['Function Name']``, then by
    the SM clock metric and by ``kernel_data[col]``.

    Args:
        kernel_data: Mapping (a dict or a DataFrame row) from column name to
            value; must contain 'Function Name' and the metric columns read below.
        col: Name of the column used as the final multiplier; this notebook
            passes the ``gpu__time_duration.sum`` column.
        breakdown: When truthy, the tensor metric and the chosen factor are
            returned as well.

    Returns:
        The FLOP estimate alone, or - despite the ``float`` annotation - the
        tuple ``(flops, tensor_metric, factor)`` when ``breakdown`` is truthy.
    """
    kernel_name = kernel_data['Function Name']

    def has(*fragments):
        return _have_strings(kernel_name, *fragments)

    # Ordered rules: the first predicate that holds decides the factor, so the
    # order of the entries matters.
    rules = (
        # ampere (A100 etc) fp16
        (lambda: has('16816', 'tensor16x8x16'), 4096),
        (lambda: has('1688', 'tensor16x8x8'), 2048),
        # ampere (A100 etc) int8
        (lambda: has('i8i8_i8i32_f32') and has('tensor16x8x32'), 8192),
        (lambda: has('i8i8_i32_f32'), 8192),
        (lambda: has('i8i8_i8i32_f32') and has('tensor8x8x16'), 2048),
        # e.g. ampere_first_layer_filter3x3_imma_fwd_swish_execute_filter3x3_swish_kernel_trt
        (lambda: has('imma') and has('ampere'), 2048),
        # TODO: need to verify
        # volta (V100 etc), HMMA.884.F16.F16 fix
        (
            lambda: (
                has('h884')
                or (has('f16f16_f16f16_f16') and has('tensor8x8x4'))
            ) and not has('s884'),
            1024,
        ),
    )

    # default (volta), 8x8x4 x 2 OP/FMA
    factor = next((value for matches, value in rules if matches()), 2048)

    tensor_metric = kernel_data['smsp__inst_executed_pipe_tensor.sum.per_cycle_elapsed [inst/cycle]']
    clock = kernel_data['smsp__cycles_elapsed.avg.per_second [Ghz]']
    multiplier = kernel_data[col]
    flops = tensor_metric * factor * clock * multiplier

    if not breakdown:
        return flops
    return flops, tensor_metric, factor


def ncu_get_flops(kernel_data: dict, data_width: int, col: str) -> tuple:
    """Return ``(total_flops, tensor_flops)`` for one profiled kernel.

    The total is the sum of the double-, single-, half-precision and
    tensor-pipe estimates; the tensor-pipe share is also returned on its own.
    Each estimate is computed exactly once.

    Args:
        kernel_data: Mapping (a dict or a DataFrame row) from column name to value.
        data_width: Currently unused; kept so existing call sites keep working.
        col: Name of the column the per-precision helpers multiply by; this
            notebook passes the ``gpu__time_duration.sum`` column.

    Returns:
        A 2-tuple ``(total, tensor)``. No fallback is applied when the total is zero.
    """
    double = _ncu_get_flops_double(kernel_data, col)
    single = _ncu_get_flops_single(kernel_data, col)
    half = _ncu_get_flops_half(kernel_data, col)
    tensor = _ncu_get_flops_tensor(kernel_data, col)

    # Summing in this fixed order keeps floating-point results stable.
    total = sum((double, single, half, tensor))
    return total, tensor

In [None]:
# Store each row's total FLOP estimate (first element of ncu_get_flops' result)
# in a new 'FLOPs' column.
df['FLOPs'] = df.apply(
    lambda kernel_row: ncu_get_flops(kernel_row, 32, col)[0],
    axis=1,
)

# Add up the per-row totals for every kernel name and show the result.
kernel_flops_sum = df['FLOPs'].groupby(df['Demangled Name']).sum()
print(kernel_flops_sum)

# # print('=====================')
# # df['Tensor FLOPs'] = df.apply(lambda row: ncu_get_flops(row, 32, col)[1], axis=1)
# # kernel_tensor_flops_sum = df.groupby('Demangled Name')['Tensor FLOPs'].sum()
# # print(kernel_tensor_flops_sum)

In [None]:
# Per-kernel FLOP totals, ordered from largest to smallest.
kernel_flops_sum = df['FLOPs'].groupby(df['Demangled Name']).sum()
kernel_flops_sum_sorted = kernel_flops_sum.sort_values(ascending=False)

# Display the totals scaled by 1e6 and divided by 20.
# NOTE(review): 20 presumably matches the "epoch_20" in the input file name - confirm.
kernel_flops_sum_sorted / 1e6 / 20

In [None]:
# Row count divided by 20, the same divisor the FLOP and memory averages use.
# NOTE(review): presumably the number of rows per profiled pass - confirm what 20 stands for.
df.shape[0] / 20

In [None]:
# Grand total over all kernels, scaled by 1e6 and divided by 20 like the per-kernel figures.
kernel_flops_sum_sorted.sum() / 1e6 / 20

In [None]:
# Estimated DRAM traffic per row: the DRAM throughput column times the duration column.
bandwidth_col = "dram__bytes.sum.per_second [Gbyte/s]"

# .assign builds a new frame, so no column is written into a slice of df
# (avoids pandas' SettingWithCopyWarning).
filtered_df = df[["Demangled Name", bandwidth_col, col]].assign(
    mem=df[bandwidth_col] * df[col]
)

# Sum 'mem' per kernel name, then scale by 1e3 and divide by 20.
# NOTE(review): 20 presumably matches the "epoch_20" in the input file name - confirm.
kernel_mem_sum = filtered_df.groupby('Demangled Name')['mem'].sum() / 1e3 / 20

# Largest first
kernel_mem_sum_sorted = kernel_mem_sum.sort_values(ascending=False)

print(kernel_mem_sum_sorted)

In [None]:
# indices and range selected manually for now:
# the first window covers rows 0..43, followed by 19 windows of 43 rows each
# starting at row 44.
# ncu_get_flops returns (total, tensor); only the total ([0]) is accumulated -
# adding the whole tuple to a number raises TypeError.
all_flops = sum(ncu_get_flops(df.iloc[i], 32, col)[0] for i in range(44))

print('FLOPS:', all_flops/1e6)
print('\n\n')

x = 1
for j in range(19):
    x += 43
    all_flops = sum(ncu_get_flops(df.iloc[i], 32, col)[0] for i in range(x, x+43))

    print('FLOPS:', all_flops/1e6)
    print('\n\n')

In [None]:
# Columns copied into the per-kernel summary table: the kernel name, the three
# metric columns read by _ncu_get_flops_single (fadd, fmul, ffma x2), the SM
# clock column, and the duration column found earlier (`col`).
fieldnames = [
    'Demangled Name',
    'smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed [inst/cycle]',
    'smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed [inst/cycle]',
    'derived__smsp__sass_thread_inst_executed_op_ffma_pred_on_x2 [inst]',
    'smsp__cycles_elapsed.avg.per_second [Ghz]',
    col
]

In [None]:
from prettytable import PrettyTable

# One summary record per kernel name.
data = []

for kernel in value_counts.index:
    kernel_rows = occurrences[kernel]

    # ncu_get_flops returns (total, tensor); only the total ([0]) is summed -
    # adding the whole tuple to a number raises TypeError.
    # occurrences stores index labels, so rows are fetched with .loc.
    kernel_flops = sum(ncu_get_flops(df.loc[idx], 32, col)[0] for idx in kernel_rows)

    # Metric values are taken from the kernel's first occurrence only.
    first_row = df.loc[kernel_rows[0]]
    attributes = {field: first_row[field] for field in fieldnames}
    # NOTE(review): the label says GFLOPs; confirm against the units of the exported columns.
    attributes['FLOPs (GFLOPs)'] = kernel_flops / 1e6

    data.append(attributes)

# Build the summary frame and order it by FLOPs, largest first.
df_kernel_flops = pd.DataFrame(data)
df_kernel_flops = df_kernel_flops.sort_values(by='FLOPs (GFLOPs)', ascending=False)

df_kernel_flops

In [None]:
from prettytable import PrettyTable

# Render the per-kernel summary as a plain-text table: the header comes from
# the frame's column labels, followed by one table row per frame row.
table = PrettyTable()
table.field_names = list(df_kernel_flops.columns)

for _, kernel_row in df_kernel_flops.iterrows():
    table.add_row(kernel_row.tolist())

print(table)