In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/11.0.167"
homedir = os.path.dirname(os.getcwd())

In [3]:
#input and output dirs
datadirs = ["/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla"]
outputdir = "/global/cfs/cdirs/m1759/yswang/results/conv2d-tf2-noxla"

# Functions

In [4]:
def transpose_frame(df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["ID", "Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]
    resultkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]

    #as metricdf use df_summary
    metricdf = df_metrics.copy()
    #metricdf.sort_values(by=selectkeys,inplace=True)
    #metricdf.reset_index(drop=True, inplace=True)

    #remove the calibration
    metricdf = metricdf[metricdf["Pass"] != "calibrate"]
    profiledf = pd.DataFrame(columns=selectkeys)
    
    ####### Get timing information
    ### CUDA Time
    # get cycles
    metricname = "CUDA Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "CUDA Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("CUDA Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["CUDA Rates"]] *= 1e9
    # manual merge and compute CUDA Time
    cyclesdf["CUDA Rates"] = list(ratesdf["CUDA Rates"])
    cyclesdf["CUDA Time"] = cyclesdf["CUDA Cycles"] / cyclesdf["CUDA Rates"]
    # merge with output
    profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()
    
    ### Combine
    del profiledf['ID']
    del metricdf['ID']
    profiledf['Invocations'] = 1
    profiledf = profiledf.groupby(resultkeys).sum().reset_index()
        
    ####### Get number of FLOPs
    
    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2
    
    ### FP32 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_fadd_pred_on',
               'sm__sass_thread_inst_executed_op_ffma_pred_on',
               'sm__sass_thread_inst_executed_op_fmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP32 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP32 FLOPs"]], on=resultkeys, how="inner")
    
    ### FP16 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_hadd_pred_on',
               'sm__sass_thread_inst_executed_op_hfma_pred_on',
               'sm__sass_thread_inst_executed_op_hmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP16 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP16 FLOPs"]], on=resultkeys, how="inner")
    
    ### TC FLOPs
    tmpdf = metricdf.loc[ metricdf["Metric Name"].str.contains("sm__inst_executed_pipe_tensor"), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "TC FLOPs"})
    tmpdf["TC FLOPs"] = 512 * tmpdf["TC FLOPs"]
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["TC FLOPs"]], on=resultkeys, how="inner")

    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP16 FLOPs"] + profiledf["TC FLOPs"] #+ metricdf["FP64 FLOPs"]
        
    ### FLOPs fractions
    #profiledf["FP64 FLOPs Fraction"] = profiledf["FP64 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP32 FLOPs Fraction"] = profiledf["FP32 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP16 FLOPs Fraction"] = profiledf["FP16 FLOPs"]/profiledf["FLOPs"]
    profiledf["TC FLOPs Fraction"]   = profiledf["TC FLOPs"]/profiledf["FLOPs"]
    profiledf = profiledf.fillna(0.)

    ####### Get number of bytes
    
    # adjust metric unit
    metricdf.loc[(metricdf["Metric Unit"]=="Kbyte"), ["Metric Value"]] *= 1e3
    metricdf.loc[(metricdf["Metric Unit"]=="Mbyte"), ["Metric Value"]] *= 1e6
    metricdf.loc[(metricdf["Metric Unit"]=="Gbyte"), ["Metric Value"]] *= 1e9
    
    ### L1 Bytes
    #project out
    l1df = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l1df = l1df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L1 Bytes"})
    # merge
    profiledf = profiledf.merge(l1df[resultkeys+["L1 Bytes"]], on=resultkeys, how="inner")
    
    ### L2 Bytes
    #project out
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Bytes"})
    # merge
    profiledf = profiledf.merge(l2df[resultkeys+["L2 Bytes"]], on=resultkeys, how="inner")
    
    ### DRAM Bytes
    #project out
    dramdf = metricdf.loc[metricdf["Metric Name"].str.contains("dram__bytes"), selectkeys+["Metric Value"] ].copy()
    dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "DRAM Bytes"})
    # merge
    profiledf = profiledf.merge(dramdf[resultkeys+["DRAM Bytes"]], on=resultkeys, how="inner")
    
    ### Get performance
    profiledf["Performance GFlop/s"] = profiledf["FLOPs"] / (profiledf["CUDA Time"]*10**9)

    ### Get AI
    # L1
    profiledf["L1 AI"]        = profiledf["FLOPs"]      / profiledf["L1 Bytes"]
    profiledf["FP32 L1 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["FP16 L1 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["TC L1 AI"]     = profiledf["TC FLOPs"]   / profiledf["L1 Bytes"]
    # L2
    profiledf["L2 AI"]        = profiledf["FLOPs"]      / profiledf["L2 Bytes"]
    profiledf["FP32 L2 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["FP16 L2 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["TC L2 AI"]     = profiledf["TC FLOPs"]   / profiledf["L2 Bytes"]
    # DRAM
    profiledf["DRAM AI"]      = profiledf["FLOPs"]      / profiledf["DRAM Bytes"]
    profiledf["FP32 DRAM AI"] = profiledf["FP32 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["FP16 DRAM AI"] = profiledf["FP16 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["TC DRAM AI"]   = profiledf["TC FLOPs"]   / profiledf["DRAM Bytes"]
    
    ### Cleanup
    profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)

    return profiledf

# Import Data

In [5]:
#get all the files
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".ncu-rep"))]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-1])
    
    #append to records
    records.append({"prefix": prefix, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#display(recorddf["prefix"])

In [6]:
#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
xla_list = set([x.split(".")[-1] for x in recorddf["prefix"]])
if (len(xla_list) != 1):
    raise RuntimeError("too many xla options.")
xla = xla_list.pop()

all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])
all_passes = set([x.split(".")[0] for x in all_passes])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    df_times = []
    df_metrics = []
    for pas in all_passes:
        
        #project frame
        files = recorddf.loc[ recorddf["prefix"] == pref + ".pass_" + pas + "." + xla, "file" ].values
        
        #project the invididual files
        metricfile = [x for x in files if x.endswith(".ncu-rep")][0]
        print(metricfile)
            
        #get the parameters from the filename
        parameters = parse_filename_nsight(os.path.basename(metricfile))
            
        #metrics
        metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)
        for key in parameters:
            metricdf[key] = parameters[key]

        #fuse read/write metrics together:
        unique_metrics = metricdf["Metric Name"].unique()
        
        unique_metrics = set([x.replace(".sum","").replace(".per_second","").replace(".avg","") for x in unique_metrics])
        #add the metric type
        metricdf["Metric Type"] = "total"
        #rate
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second"), "Metric Type" ] = "rate"
                
        for metric in unique_metrics:
            metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric
                
        #append to DF:
        df_metrics.append(metricdf)
    
    metricdf = pd.concat(df_metrics)
    
    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)
profiledf.reset_index(drop=True, inplace=True)

/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_9x9x64x64.stride_2.data_NHWC.fp32.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_9x9x64x64.stride_2.data_NHWC.fp32.pass_forward.noxla.ncu-rep


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_9x9x64x64.stride_2.data_NHWC.fp32.pass_calibrate.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x256.stride_2.data_NHWC.fp16.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x256.stride_2.data_NHWC.fp16.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_16.input_112x112x64.kernel_3x3x64x256.stride_2.data_NHWC.fp16.pass_calibrate.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_64.input_112x112x64.kernel_3x3x64x64.stride_2.data_NHWC.fp16.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/conv2d-tf2-noxla/tf2.name_ResNet50-2.batch_64.input_112x112x64.kernel_3x3x64x64.stride_2.data_NHWC.fp16.pass_forward.nox

In [7]:
display(profiledf)

Unnamed: 0,Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Name,CUDA Time,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
0,FP32,ResNet50-2,NHWC,112x112x64,9x9x64x64,2,16,backward,cudnn::gemm::computeOffsetsKernel(cudnn::gemm:...,0.000003,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
1,FP32,ResNet50-2,NHWC,112x112x64,9x9x64x64,2,16,backward,cudnn::gemm::computeWgradBOffsetsKernel(cudnn:...,0.000007,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
2,FP32,ResNet50-2,NHWC,112x112x64,9x9x64x64,2,16,backward,cudnn::gemm::computeWgradSplitKOffsetsKernel(c...,0.000007,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
3,FP32,ResNet50-2,NHWC,112x112x64,9x9x64x64,2,16,backward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.000003,...,0.000000,0.000000,0.000120,0.00012,0.000000,0.00000,0.000082,0.000082,0.000000,0.000000
4,FP32,ResNet50-2,NHWC,112x112x64,9x9x64x64,2,16,backward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.000045,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,forward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.000065,...,0.111116,0.000000,0.124952,0.00000,0.124952,0.00000,0.132999,0.000000,0.132999,0.000000
456,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,forward,void tensorflow::functor::FillPhiloxRandomKern...,0.000099,...,0.128112,0.000000,0.105621,0.00000,0.105621,0.00000,0.279743,0.000000,0.279743,0.000000
457,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,forward,void tensorflow::functor::PadInputCustomKernel...,0.000209,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
458,FP16,ResNet50-2,NHWC,112x112x64,7x7x64x64,2,16,forward,void tensorflow::functor::ShuffleInTensor3Simp...,0.000008,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()


#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction"] = combineddf["FP32 FLOPs"] / combineddf["FLOPs"]
combineddf["FP16 FLOPs Fraction"] = combineddf["FP16 FLOPs"] / combineddf["FLOPs"]
combineddf["TC FLOPs Fraction"]   = combineddf["TC FLOPs"]   / combineddf["FLOPs"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs"]      / (combineddf["CUDA Time"]*10**9)


### Get AI
# L1
combineddf["L1 AI"]        = combineddf["FLOPs"]      / combineddf["L1 Bytes"]
combineddf["FP32 L1 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L1 Bytes"]
combineddf["FP16 L1 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L1 Bytes"]
combineddf["TC L1 AI"]     = combineddf["TC FLOPs"]   / combineddf["L1 Bytes"]
# L2
combineddf["L2 AI"]        = combineddf["FLOPs"]      / combineddf["L2 Bytes"]
combineddf["FP32 L2 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L2 Bytes"]
combineddf["FP16 L2 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L2 Bytes"]
combineddf["TC L2 AI"]     = combineddf["TC FLOPs"]   / combineddf["L2 Bytes"]
# DRAM
combineddf["DRAM AI"]      = combineddf["FLOPs"]      / combineddf["DRAM Bytes"]
combineddf["FP32 DRAM AI"] = combineddf["FP32 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["FP16 DRAM AI"] = combineddf["FP16 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["TC DRAM AI"]   = combineddf["TC FLOPs"]   / combineddf["DRAM Bytes"]

combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)

In [9]:
display(combineddf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,CUDA Time,Invocations,FP32 FLOPs,FP16 FLOPs,TC FLOPs,FLOPs,FP32 FLOPs Fraction,FP16 FLOPs Fraction,TC FLOPs Fraction,L1 Bytes,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,0.002754,30,120834000.0,53446660.0,64911050000.0,65085330000.0,0.001857,0.000821,0.997322,1650453000.0,...,0.032383,39.329235,40.352514,0.074916,0.033137,40.244461,80.289386,0.149061,0.065932,80.074393
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,0.000595,8,12845060.0,27148290.0,7398752000.0,7438746000.0,0.001727,0.00365,0.994624,403972100.0,...,0.067203,18.315007,18.2124,0.031449,0.066468,18.114484,30.406025,0.052504,0.110969,30.242552
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,0.003384,30,133826600.0,54396930.0,99623110000.0,99811330000.0,0.001341,0.000545,0.998114,3280693000.0,...,0.016581,30.366486,34.434355,0.046169,0.018767,34.369419,101.428463,0.135995,0.055278,101.23719
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,0.000685,8,25690110.0,27951100.0,14797500000.0,14851150000.0,0.00173,0.001882,0.996388,535112100.0,...,0.052234,27.653092,27.675143,0.047874,0.052087,27.575182,48.938653,0.084656,0.092107,48.76189
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,backward,0.005005,36,207882200.0,56297470.0,257228300000.0,257492500000.0,0.000807,0.000219,0.998974,4362263000.0,...,0.012906,58.966701,61.027476,0.04927,0.013343,60.964864,202.892314,0.163802,0.04436,202.684152
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,forward,0.000899,8,51380220.0,29556740.0,29595010000.0,29675950000.0,0.001731,0.000996,0.997273,680732100.0,...,0.043419,43.475265,43.656891,0.075586,0.043482,43.537823,71.278245,0.123409,0.070992,71.083844
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,backward,0.00168,27,92479490.0,27951100.0,73987520000.0,74107950000.0,0.001248,0.000377,0.998375,2164987000.0,...,0.012911,34.174586,34.670225,0.043265,0.013076,34.613883,158.71164,0.198057,0.059861,158.453723
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,forward,0.00049,7,25690110.0,27951100.0,14797500000.0,14851150000.0,0.00173,0.001882,0.996388,535638000.0,...,0.052183,27.625944,27.620078,0.047778,0.051983,27.520316,73.146423,0.126532,0.137668,72.882224
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,0.003303,29,7487498000.0,169388100.0,17356030000.0,25012920000.0,0.299345,0.006772,0.693883,3150196000.0,...,0.053771,5.509508,18.427881,5.516299,0.124794,12.786788,36.994493,11.074127,0.250528,25.669839
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,0.00055,7,6422528.0,26345470.0,3699376000.0,3732144000.0,0.001721,0.007059,0.99122,453900100.0,...,0.058042,8.150199,8.876985,0.015276,0.062663,8.799045,16.486916,0.028372,0.116382,16.342162


In [10]:
display(combineddf[["L2 AI", "L1 AI"]])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,40.352514,39.43483
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,18.2124,18.414008
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,34.434355,30.423859
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,27.675143,27.753335
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,backward,61.027476,59.027261
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,forward,43.656891,43.594162
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,backward,34.670225,34.230213
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,1,16,forward,27.620078,27.726089
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,18.427881,7.940114
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,8.876985,8.222391


# Export Data

In [11]:
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))