In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/11.0.167"
homedir = os.path.dirname(os.getcwd())

In [3]:
#input and output dirs
datadirs = ["/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla"]
outputdir = "/global/cfs/cdirs/m1759/yswang/results/conv2d-tf1-noxla"

# Functions

In [4]:
def transpose_frame(df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["ID", "Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]
    resultkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]

    #as metricdf use df_summary
    metricdf = df_metrics.copy()
    #metricdf.sort_values(by=selectkeys,inplace=True)
    #metricdf.reset_index(drop=True, inplace=True)

    #remove the calibration
    metricdf = metricdf[metricdf["Pass"] != "calibrate"]
    profiledf = pd.DataFrame(columns=selectkeys)
    
    ####### Get timing information
    ### CUDA Time
    # get cycles
    metricname = "CUDA Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "CUDA Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("CUDA Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["CUDA Rates"]] *= 1e9
    # manual merge and compute CUDA Time
    cyclesdf["CUDA Rates"] = list(ratesdf["CUDA Rates"])
    cyclesdf["CUDA Time"] = cyclesdf["CUDA Cycles"] / cyclesdf["CUDA Rates"]
    # merge with output
    profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()
    
    ### Combine
    del profiledf['ID']
    del metricdf['ID']
    profiledf['Invocations'] = 1
    profiledf = profiledf.groupby(resultkeys).sum().reset_index()
        
    ####### Get number of FLOPs
    
    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2
    
    ### FP32 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_fadd_pred_on',
               'sm__sass_thread_inst_executed_op_ffma_pred_on',
               'sm__sass_thread_inst_executed_op_fmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP32 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP32 FLOPs"]], on=resultkeys, how="inner")
    
    ### FP16 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_hadd_pred_on',
               'sm__sass_thread_inst_executed_op_hfma_pred_on',
               'sm__sass_thread_inst_executed_op_hmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP16 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP16 FLOPs"]], on=resultkeys, how="inner")
    
    ### TC FLOPs
    tmpdf = metricdf.loc[ metricdf["Metric Name"].str.contains("sm__inst_executed_pipe_tensor"), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "TC FLOPs"})
    tmpdf["TC FLOPs"] = 512 * tmpdf["TC FLOPs"]
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["TC FLOPs"]], on=resultkeys, how="inner")

    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP16 FLOPs"] + profiledf["TC FLOPs"] #+ metricdf["FP64 FLOPs"]
        
    ### FLOPs fractions
    #profiledf["FP64 FLOPs Fraction"] = profiledf["FP64 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP32 FLOPs Fraction"] = profiledf["FP32 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP16 FLOPs Fraction"] = profiledf["FP16 FLOPs"]/profiledf["FLOPs"]
    profiledf["TC FLOPs Fraction"]   = profiledf["TC FLOPs"]/profiledf["FLOPs"]
    profiledf = profiledf.fillna(0.)

    ####### Get number of bytes
    
    # adjust metric unit
    metricdf.loc[(metricdf["Metric Unit"]=="Kbyte"), ["Metric Value"]] *= 1e3
    metricdf.loc[(metricdf["Metric Unit"]=="Mbyte"), ["Metric Value"]] *= 1e6
    metricdf.loc[(metricdf["Metric Unit"]=="Gbyte"), ["Metric Value"]] *= 1e9
    
    ### L1 Bytes
    #project out
    l1df = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l1df = l1df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L1 Bytes"})
    # merge
    profiledf = profiledf.merge(l1df[resultkeys+["L1 Bytes"]], on=resultkeys, how="inner")
    
    ### L2 Bytes
    #project out
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Bytes"})
    # merge
    profiledf = profiledf.merge(l2df[resultkeys+["L2 Bytes"]], on=resultkeys, how="inner")
    
    ### DRAM Bytes
    #project out
    dramdf = metricdf.loc[metricdf["Metric Name"].str.contains("dram__bytes"), selectkeys+["Metric Value"] ].copy()
    dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "DRAM Bytes"})
    # merge
    profiledf = profiledf.merge(dramdf[resultkeys+["DRAM Bytes"]], on=resultkeys, how="inner")
    
    ### Get performance
    profiledf["Performance GFlop/s"] = profiledf["FLOPs"] / (profiledf["CUDA Time"]*10**9)

    ### Get AI
    # L1
    profiledf["L1 AI"]        = profiledf["FLOPs"]      / profiledf["L1 Bytes"]
    profiledf["FP32 L1 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["FP16 L1 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["TC L1 AI"]     = profiledf["TC FLOPs"]   / profiledf["L1 Bytes"]
    # L2
    profiledf["L2 AI"]        = profiledf["FLOPs"]      / profiledf["L2 Bytes"]
    profiledf["FP32 L2 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["FP16 L2 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["TC L2 AI"]     = profiledf["TC FLOPs"]   / profiledf["L2 Bytes"]
    # DRAM
    profiledf["DRAM AI"]      = profiledf["FLOPs"]      / profiledf["DRAM Bytes"]
    profiledf["FP32 DRAM AI"] = profiledf["FP32 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["FP16 DRAM AI"] = profiledf["FP16 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["TC DRAM AI"]   = profiledf["TC FLOPs"]   / profiledf["DRAM Bytes"]
    
    ### Cleanup
    profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)

    return profiledf

# Import Data

In [5]:
#get all the files
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".ncu-rep"))]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-1])
    
    #append to records
    records.append({"prefix": prefix, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#display(recorddf["prefix"])

In [6]:
#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
xla_list = set([x.split(".")[-1] for x in recorddf["prefix"]])
if (len(xla_list) != 1):
    raise RuntimeError("too many xla options.")
xla = xla_list.pop()

all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])
all_passes = set([x.split(".")[0] for x in all_passes])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    df_times = []
    df_metrics = []
    for pas in all_passes:
        
        #project frame
        files = recorddf.loc[ recorddf["prefix"] == pref + ".pass_" + pas + "." + xla, "file" ].values
        
        #project the invididual files
        metricfile = [x for x in files if x.endswith(".ncu-rep")][0]
        print(metricfile)
            
        #get the parameters from the filename
        parameters = parse_filename_nsight(os.path.basename(metricfile))
            
        #metrics
        metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)
        for key in parameters:
            metricdf[key] = parameters[key]

        #fuse read/write metrics together:
        unique_metrics = metricdf["Metric Name"].unique()
        
        unique_metrics = set([x.replace(".sum","").replace(".per_second","").replace(".avg","") for x in unique_metrics])
        #add the metric type
        metricdf["Metric Type"] = "total"
        #rate
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second"), "Metric Type" ] = "rate"
                
        for metric in unique_metrics:
            metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric
                
        #append to DF:
        df_metrics.append(metricdf)
    
    metricdf = pd.concat(df_metrics)
    
    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)
profiledf.reset_index(drop=True, inplace=True)

/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_7x7x64x64.stride_2.data_NHWC.fp16.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_7x7x64x64.stride_2.data_NHWC.fp16.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_7x7x64x64.stride_2.data_NHWC.fp16.pass_calibrate.noxla.ncu-rep


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp16.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp16.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp16.pass_calibrate.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp32.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp32.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf1-noxla/tf1.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x128.stride_2.data_NHWC.fp32.pass_calibrate.

In [7]:
display(profiledf)

Unnamed: 0,Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Name,CUDA Time,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
0,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,backward,cudnn::gemm::computeWgradOffsetsKernel(cudnn::...,0.000004,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
1,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,backward,"void nchwToNhwcKernel<__half, __half, float, t...",0.000093,...,0.210314,0.000000,0.420140,0.210070,0.210070,0.000000,0.545985,0.272992,0.272992,0.00000
2,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,backward,"void nhwcToNchwKernel<float, __half, float, tr...",0.000007,...,0.000000,0.000000,0.132917,0.132917,0.000000,0.000000,0.246502,0.246502,0.000000,0.00000
3,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,backward,"void scalePackedTensor_kernel<float, float>(cu...",0.000004,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
4,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,backward,void tensorflow::functor::FillPhiloxRandomKern...,0.000100,...,0.128112,0.000000,0.105844,0.000000,0.105844,0.000000,0.283741,0.000000,0.283741,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,64,backward,void tensorflow::functor::SwapDimension1And2In...,0.000286,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
211,FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,64,forward,Volta_hmma_implicit_gemm_fprop_fp32_nhwc_64x32...,0.000364,...,0.000000,20.571518,25.925100,0.044931,0.000000,25.880169,113.840678,0.197298,0.000000,113.64338
212,FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,64,forward,void tensorflow::functor::FillPhiloxRandomKern...,0.000380,...,0.125793,0.000000,0.102537,0.000000,0.102537,0.000000,0.258781,0.000000,0.258781,0.00000
213,FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,64,forward,void tensorflow::functor::PadInputCustomKernel...,0.000824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()


#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction"] = combineddf["FP32 FLOPs"] / combineddf["FLOPs"]
combineddf["FP16 FLOPs Fraction"] = combineddf["FP16 FLOPs"] / combineddf["FLOPs"]
combineddf["TC FLOPs Fraction"]   = combineddf["TC FLOPs"]   / combineddf["FLOPs"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs"]      / (combineddf["CUDA Time"]*10**9)


### Get AI
# L1
combineddf["L1 AI"]        = combineddf["FLOPs"]      / combineddf["L1 Bytes"]
combineddf["FP32 L1 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L1 Bytes"]
combineddf["FP16 L1 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L1 Bytes"]
combineddf["TC L1 AI"]     = combineddf["TC FLOPs"]   / combineddf["L1 Bytes"]
# L2
combineddf["L2 AI"]        = combineddf["FLOPs"]      / combineddf["L2 Bytes"]
combineddf["FP32 L2 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L2 Bytes"]
combineddf["FP16 L2 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L2 Bytes"]
combineddf["TC L2 AI"]     = combineddf["TC FLOPs"]   / combineddf["L2 Bytes"]
# DRAM
combineddf["DRAM AI"]      = combineddf["FLOPs"]      / combineddf["DRAM Bytes"]
combineddf["FP32 DRAM AI"] = combineddf["FP32 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["FP16 DRAM AI"] = combineddf["FP16 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["TC DRAM AI"]   = combineddf["TC FLOPs"]   / combineddf["DRAM Bytes"]

combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)

In [9]:
display(combineddf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,CUDA Time,Invocations,FP32 FLOPs,FP16 FLOPs,TC FLOPs,FLOPs,FP32 FLOPs Fraction,FP16 FLOPs Fraction,TC FLOPs Fraction,L1 Bytes,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,0.000837,8,33580030.0,33072128.0,15099490000.0,15166150000.0,0.002214,0.002181,0.995605,527100700.0,...,0.062743,28.64632,28.106267,0.062231,0.06129,27.982745,51.282346,0.113546,0.111829,51.05697
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,0.000463,5,12845060.0,14303232.0,7398752000.0,7425901000.0,0.00173,0.001926,0.996344,288372000.0,...,0.0496,25.65697,24.388755,0.042187,0.046976,24.299592,50.53192,0.087408,0.097331,50.34718
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,0.000964,8,40076290.0,39568384.0,15099490000.0,15179140000.0,0.00264,0.002607,0.994753,1074970000.0,...,0.036809,14.046433,16.669199,0.04401,0.043453,16.581736,44.912249,0.118578,0.117075,44.676595
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,0.000555,5,25690110.0,15106048.0,14797500000.0,14838300000.0,0.001731,0.001018,0.997251,419502000.0,...,0.036009,35.273977,34.255104,0.059307,0.034873,34.160923,72.597228,0.12569,0.073907,72.397631
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,backward,0.001337,10,76809220.0,52265984.0,59190020000.0,59319090000.0,0.001295,0.000881,0.997824,1137870000.0,...,0.045933,52.018255,51.651221,0.06688,0.04551,51.538831,149.557889,0.193655,0.131775,149.232459
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,forward,0.000771,5,51380220.0,16711680.0,29595010000.0,29663100000.0,0.001732,0.000563,0.997704,565112000.0,...,0.029572,52.370164,51.451987,0.089121,0.028987,51.333878,93.020626,0.161123,0.052406,92.807096
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,backward,0.001919,7,7721308000.0,44408832.0,0.0,7765717000.0,0.994281,0.005719,0.0,749843600.0,...,0.059224,0.0,10.191698,10.133416,0.058282,0.0,12.85295,12.779449,0.073501,0.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,forward,0.003672,3,14810350000.0,241967104.0,0.0,15052320000.0,0.983925,0.016075,0.0,9326630000.0,...,0.025944,0.0,9.914347,9.754973,0.159374,0.0,95.9924,94.449315,1.543085,0.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,0.001128,5,3700590000.0,71910288.0,0.0,3772501000.0,0.980938,0.019062,0.0,1189290000.0,...,0.060465,0.0,10.057701,9.865984,0.191717,0.0,20.312033,19.924851,0.387182,0.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,0.000419,4,6422528.0,13500416.0,3699376000.0,3719299000.0,0.001727,0.00363,0.994643,338300000.0,...,0.039907,10.935194,11.679903,0.020169,0.042396,11.617338,28.850253,0.049819,0.104721,28.695713


In [10]:
display(combineddf[["L2 AI", "L1 AI"]])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,28.106267,28.77277
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,24.388755,25.751113
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,16.669199,14.120523
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,34.255104,35.371226
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,backward,51.651221,52.13169
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,forward,51.451987,52.490657
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,backward,10.191698,10.35645
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,forward,9.914347,1.613907
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,10.057701,3.172061
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,11.679903,10.994085


# Export Data

In [11]:
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))