In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/11.0.167"
homedir = os.path.dirname(os.getcwd())

In [3]:
#input and output dirs
datadirs = ["/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla"]
outputdir = "/global/cfs/cdirs/m1759/yswang/results/rnn1d-pt-noxla"

# Functions

In [4]:
def transpose_frame(df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["ID", "Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass", "Name"]
    resultkeys = ["Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass", "Name"]
     #as metricdf use df_summary
    metricdf = df_metrics.copy()
    #metricdf.sort_values(by=selectkeys,inplace=True)
    #metricdf.reset_index(drop=True, inplace=True)

    #remove the calibration
    metricdf = metricdf[metricdf["Pass"] != "calibrate"]
    profiledf = pd.DataFrame(columns=selectkeys)
    
    ####### Get timing information
    ### CUDA Time
    # get cycles
    metricname = "CUDA Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "CUDA Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("CUDA Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["CUDA Rates"]] *= 1e9
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/usecond"), ["CUDA Rates"]] *= 1e6
    # manual merge and compute CUDA Time
    cyclesdf["CUDA Rates"] = list(ratesdf["CUDA Rates"])
    cyclesdf["CUDA Time"] = cyclesdf["CUDA Cycles"] / cyclesdf["CUDA Rates"]
    # merge with output
    profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()
    
    ### Combine
    del profiledf['ID']
    del metricdf['ID']
    profiledf['Invocations'] = 1
    profiledf = profiledf.groupby(resultkeys).sum().reset_index()
        
    ####### Get number of FLOPs
    
    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2
    
    ### FP32 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_fadd_pred_on',
               'sm__sass_thread_inst_executed_op_ffma_pred_on',
               'sm__sass_thread_inst_executed_op_fmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP32 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP32 FLOPs"]], on=resultkeys, how="inner")
    
    ### FP16 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_hadd_pred_on',
               'sm__sass_thread_inst_executed_op_hfma_pred_on',
               'sm__sass_thread_inst_executed_op_hmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP16 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP16 FLOPs"]], on=resultkeys, how="inner")
    
    ### TC FLOPs
    tmpdf = metricdf.loc[ metricdf["Metric Name"].str.contains("sm__inst_executed_pipe_tensor"), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "TC FLOPs"})
    tmpdf["TC FLOPs"] = 512 * tmpdf["TC FLOPs"]
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["TC FLOPs"]], on=resultkeys, how="inner")

    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP16 FLOPs"] + profiledf["TC FLOPs"] #+ metricdf["FP64 FLOPs"]

    ### FLOPs fractions
    #profiledf["FP64 FLOPs Fraction"] = profiledf["FP64 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP32 FLOPs Fraction"] = profiledf["FP32 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP16 FLOPs Fraction"] = profiledf["FP16 FLOPs"]/profiledf["FLOPs"]
    profiledf["TC FLOPs Fraction"]   = profiledf["TC FLOPs"]/profiledf["FLOPs"]
    profiledf = profiledf.fillna(0.)

    ####### Get number of bytes
    
    # adjust metric unit
    metricdf.loc[(metricdf["Metric Unit"]=="Kbyte"), ["Metric Value"]] *= 1e3
    metricdf.loc[(metricdf["Metric Unit"]=="Mbyte"), ["Metric Value"]] *= 1e6
    metricdf.loc[(metricdf["Metric Unit"]=="Gbyte"), ["Metric Value"]] *= 1e9
    
    ### Shared transactions
    #project out
    shareddf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__data_pipe_lsu_wavefronts_mem_shared_op"), resultkeys+["Metric Value"] ].copy()
    shareddf = shareddf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Shared Transactions"})
    #add to timings
    profiledf = profiledf.merge(shareddf[resultkeys+["Shared Transactions"]], on=resultkeys, how="inner")
    
    ### L1 atomic transactions
    # project out
    metrics = ['l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom',
               'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']
    atomicdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    # get reads and writes
    atomicdf = atomicdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L1 Atomic Transactions"})
    # add to timings
    profiledf = profiledf.merge(atomicdf[resultkeys+["L1 Atomic Transactions"]], on=resultkeys, how="inner")

    ### Local transactions 
    # project out
    localdf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_local_op"), resultkeys+["Metric Value"] ].copy()
    localdf = localdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Local Transactions"})
    # add to timings
    profiledf = profiledf.merge(localdf[resultkeys+["Local Transactions"]], on=resultkeys, how="inner")
    
    ### Global transactions 
    # project out
    globaldf = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_global_op"), resultkeys+["Metric Value"] ].copy()
    globaldf = globaldf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "Global Transactions"})
    # add to timings
    profiledf = profiledf.merge(globaldf[resultkeys+["Global Transactions"]], on=resultkeys, how="inner")
    
    ### L1 Bytes
    profiledf["L1 Transactions"] = (profiledf["Shared Transactions"] + profiledf["L1 Atomic Transactions"]
                            + profiledf["Local Transactions"] + profiledf["Global Transactions"])
    profiledf["L1 Bytes"] = profiledf["L1 Transactions"] * 32

    
    ### L2 atomic & reduction
    metricdf.loc[(metricdf["Metric Name"].str.contains("lts__t_sectors_op")) & (metricdf["Metric Type"]=="total"), ["Metric Value"]] *= 2

    ### L2 transactions
    # project out
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_sectors_op"), resultkeys+["Metric Value"] ].copy()
    l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Transactions"})
    l2df["L2 Bytes"] = l2df["L2 Transactions"] * 32
    # add to timings
    profiledf = profiledf.merge(l2df[resultkeys+["L2 Transactions", "L2 Bytes"]], on=resultkeys, how="inner")
    
    ### DRAM Bytes
    #project out
    dramdf = metricdf.loc[metricdf["Metric Name"].str.contains("dram__bytes"), selectkeys+["Metric Value"] ].copy()
    dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "DRAM Bytes"})
    # merge
    profiledf = profiledf.merge(dramdf[resultkeys+["DRAM Bytes"]], on=resultkeys, how="inner")
    
    ### Get performance
    profiledf["Performance GFlop/s"] = profiledf["FLOPs"] / (profiledf["CUDA Time"]*10**9)

    ### Get AI
    # L1
    profiledf["L1 AI"]        = profiledf["FLOPs"]      / profiledf["L1 Bytes"]
    profiledf["FP32 L1 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["FP16 L1 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["TC L1 AI"]     = profiledf["TC FLOPs"]   / profiledf["L1 Bytes"]
    # L2
    profiledf["L2 AI"]        = profiledf["FLOPs"]      / profiledf["L2 Bytes"]
    profiledf["FP32 L2 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["FP16 L2 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["TC L2 AI"]     = profiledf["TC FLOPs"]   / profiledf["L2 Bytes"]
    # DRAM
    profiledf["DRAM AI"]      = profiledf["FLOPs"]      / profiledf["DRAM Bytes"]
    profiledf["FP32 DRAM AI"] = profiledf["FP32 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["FP16 DRAM AI"] = profiledf["FP16 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["TC DRAM AI"]   = profiledf["TC FLOPs"]   / profiledf["DRAM Bytes"]
    
    ### Cleanup
    profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)

    return profiledf

# Import Data

In [5]:
#get all the files
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".ncu-rep"))]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-1])
    
    #append to records
    records.append({"prefix": prefix, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#display(recorddf["prefix"])

In [6]:
#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
xla_list = set([x.split(".")[-1] for x in recorddf["prefix"]])
if (len(xla_list) != 1):
    raise RuntimeError("too many xla options.")
xla = xla_list.pop()

all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])
all_passes = set([x.split(".")[0] for x in all_passes])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    df_times = []
    df_metrics = []
    for pas in all_passes:
        if pas == 'calibrate':
            continue
        
        #project frame
        files = recorddf.loc[ recorddf["prefix"] == pref + ".pass_" + pas + "." + xla, "file" ].values
        
        #project the invididual files
        metricfile = [x for x in files if x.endswith(".ncu-rep")][0]
        print(metricfile)
            
        #get the parameters from the filename
        parameters = parse_filename_nsight(os.path.basename(metricfile))
        #print(parameters)
            
        #metrics
        metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)
        for key in parameters:
            metricdf[key] = parameters[key]

        #fuse read/write metrics together:
        unique_metrics = metricdf["Metric Name"].unique()
        
        unique_metrics = set([x.replace(".sum","").replace(".per_second","").replace(".avg","") for x in unique_metrics])
        #add the metric type
        metricdf["Metric Type"] = "total"
        #rate
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second"), "Metric Type" ] = "rate"
        #print(unique_metrics)
        unique_units = metricdf["Metric Unit"].unique()
        print(unique_units)
        
        for metric in unique_metrics:
            metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric
                
        #append to DF:
        df_metrics.append(metricdf)
    
    metricdf = pd.concat(df_metrics)
    
    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)
profiledf.reset_index(drop=True, inplace=True)

/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_16x16x64.nneu_16.pass_backward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte'
 'cycle/nsecond']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_16x16x64.nneu_16.pass_forward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte']


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_64x16x32.nneu_16.pass_backward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte'
 'cycle/nsecond']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_64x16x32.nneu_16.pass_forward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_16x16x32.nneu_128.pass_backward..ncu-rep
['Kbyte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'byte'
 'cycle/nsecond']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_16x16x32.nneu_128.pass_forward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-pt-noxla/pt.fp_16.celltype_lstm.input_128x16x32.nneu_16.pass_backward..ncu-rep
['byte' nan 'sector' 'cycle' 'cycle/usecond' 'inst' 'Kbyte'
 'cycle/nsecond']
/global/cfs/cdirs/m1759/yswang/data/rnn1d-p

In [7]:
display(profiledf)

Unnamed: 0,Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Name,CUDA Time,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
0,FP16,lstm,16x16x64,16,16,64,16,backward,"void GENERIC_elementWise_bp2<__half, __half, f...",0.000007,...,0.349831,0.0,0.534826,0.320896,0.213930,0.0,1.245249,0.747149,0.498100,0.0
1,FP16,lstm,16x16x64,16,16,64,16,backward,"void LSTM_elementWise_bp1<__half, __half, floa...",0.000080,...,0.333333,0.0,0.663059,0.506196,0.156863,0.0,2.309122,1.762843,0.546279,0.0
2,FP16,lstm,16x16x64,16,16,64,16,backward,"void LSTM_elementWise_fp<__half, __half, float...",0.000081,...,0.447368,0.0,0.762160,0.518432,0.243728,0.0,2.777372,1.889209,0.888163,0.0
3,FP16,lstm,16x16x64,16,16,64,16,backward,void at::native::legacy::elementwise_kernel<12...,0.000018,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
4,FP16,lstm,16x16x64,16,16,64,16,backward,void at::native::legacy::elementwise_kernel<12...,0.000003,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,FP32,lstm,16x16x64,16,16,64,16,forward,void at::native::legacy::elementwise_kernel<12...,0.000006,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
495,FP32,lstm,16x16x64,16,16,64,16,forward,void at::native::modern::vectorized_elementwis...,0.000003,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
496,FP32,lstm,16x16x64,16,16,64,16,forward,"void gemmSN_NN_kernel<float, 256, 4, 2, 8, 4, ...",0.000097,...,0.000000,0.0,0.836218,0.836218,0.000000,0.0,7.759073,7.759073,0.000000,0.0
497,FP32,lstm,16x16x64,16,16,64,16,forward,void transpose_readWrite_alignment_kernel<floa...,0.000012,...,0.000000,0.0,0.065306,0.065306,0.000000,0.0,0.297242,0.297242,0.000000,0.0


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()


#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction"] = combineddf["FP32 FLOPs"] / combineddf["FLOPs"]
combineddf["FP16 FLOPs Fraction"] = combineddf["FP16 FLOPs"] / combineddf["FLOPs"]
combineddf["TC FLOPs Fraction"]   = combineddf["TC FLOPs"]   / combineddf["FLOPs"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs"]      / (combineddf["CUDA Time"]*10**9)


### Get AI
# L1
combineddf["L1 AI"]        = combineddf["FLOPs"]      / combineddf["L1 Bytes"]
combineddf["FP32 L1 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L1 Bytes"]
combineddf["FP16 L1 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L1 Bytes"]
combineddf["TC L1 AI"]     = combineddf["TC FLOPs"]   / combineddf["L1 Bytes"]
# L2
combineddf["L2 AI"]        = combineddf["FLOPs"]      / combineddf["L2 Bytes"]
combineddf["FP32 L2 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L2 Bytes"]
combineddf["FP16 L2 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L2 Bytes"]
combineddf["TC L2 AI"]     = combineddf["TC FLOPs"]   / combineddf["L2 Bytes"]
# DRAM
combineddf["DRAM AI"]      = combineddf["FLOPs"]      / combineddf["DRAM Bytes"]
combineddf["FP32 DRAM AI"] = combineddf["FP32 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["FP16 DRAM AI"] = combineddf["FP16 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["TC DRAM AI"]   = combineddf["TC FLOPs"]   / combineddf["DRAM Bytes"]

combineddf['Framework'] = 'PT'
combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)

In [9]:
display(combineddf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,CUDA Time,Invocations,FP32 FLOPs,FP16 FLOPs,TC FLOPs,FLOPs,FP32 FLOPs Fraction,FP16 FLOPs Fraction,TC FLOPs Fraction,Shared Transactions,...,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI,Framework
Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,lstm,128x16x32,128,16,32,16,backward,0.000785,108,3288459.0,1076864.0,87031808.0,91397131.0,0.03598,0.011782,0.952238,178344.0,...,6.955565,6.760303,0.243235,0.079652,6.437416,23.557577,0.847599,0.277561,22.432416,PT
FP16,lstm,128x16x32,128,16,32,16,forward,0.000327,42,1759809.0,557056.0,16777216.0,19094081.0,0.092165,0.029174,0.878661,45312.0,...,4.325238,4.131401,0.380771,0.120531,3.630099,19.312545,1.779944,0.56343,16.969171,PT
FP16,lstm,16x128x32,16,128,32,16,backward,0.003466,572,16096792.0,2256512.0,60555264.0,78908568.0,0.203993,0.028597,0.767411,1100144.0,...,1.324045,3.474018,0.708675,0.099345,2.665998,17.689276,3.608488,0.505852,13.574936,PT
FP16,lstm,16x128x32,16,128,32,16,forward,0.001794,274,10158980.0,1081344.0,8388608.0,19628932.0,0.517551,0.055089,0.427359,891008.0,...,0.248993,1.764461,0.913199,0.097203,0.754059,15.201284,7.867445,0.837428,6.496411,PT
FP16,lstm,16x16x128,16,16,128,16,backward,0.000549,95,2150463.0,355968.0,14942208.0,17448639.0,0.123245,0.020401,0.856354,149402.0,...,2.115089,3.56768,0.4397,0.072784,3.055196,16.908677,2.083915,0.344952,14.47981,PT
FP16,lstm,16x16x128,16,16,128,16,forward,0.000236,36,1257094.0,135168.0,4194304.0,5586566.0,0.225021,0.024195,0.750784,115224.0,...,0.925728,3.107294,0.699206,0.075182,2.332906,19.638645,4.419105,0.475161,14.744379,PT
FP16,lstm,16x16x32,16,16,32,16,backward,0.000544,95,2048774.0,309888.0,7602176.0,9960838.0,0.205683,0.031111,0.763206,137325.0,...,1.321276,3.283366,0.675332,0.102148,2.505886,17.254359,3.548927,0.536794,13.168639,PT
FP16,lstm,16x16x32,16,16,32,16,forward,0.000232,36,1268868.0,135168.0,1048576.0,2452612.0,0.517354,0.055112,0.427534,111376.0,...,0.248967,1.746039,0.90332,0.096227,0.746492,15.177242,7.852003,0.836446,6.488793,PT
FP16,lstm,16x16x32,16,16,32,32,backward,0.000526,97,5370056.0,919808.0,15204352.0,21494216.0,0.249837,0.042793,0.707369,134579.0,...,2.104635,3.617288,0.903733,0.154796,2.558759,17.341905,4.332654,0.742117,12.267134,PT
FP16,lstm,16x16x32,16,16,32,32,forward,0.000204,38,2764602.0,417792.0,2097152.0,5279546.0,0.523644,0.079134,0.397222,77472.0,...,0.565433,2.232797,1.16919,0.17669,0.886916,14.237567,7.455415,1.126677,5.655475,PT


In [10]:
display(combineddf[["L2 AI", "L1 AI"]])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,lstm,128x16x32,128,16,32,16,backward,6.760303,7.30444
FP16,lstm,128x16x32,128,16,32,16,forward,4.131401,4.922535
FP16,lstm,16x128x32,16,128,32,16,backward,3.474018,1.725342
FP16,lstm,16x128x32,16,128,32,16,forward,1.764461,0.582632
FP16,lstm,16x16x128,16,16,128,16,backward,3.56768,2.469878
FP16,lstm,16x16x128,16,16,128,16,forward,3.107294,1.233015
FP16,lstm,16x16x32,16,16,32,16,backward,3.283366,1.731216
FP16,lstm,16x16x32,16,16,32,16,forward,1.746039,0.582331
FP16,lstm,16x16x32,16,16,32,32,backward,3.617288,2.975298
FP16,lstm,16x16x32,16,16,32,32,forward,2.232797,1.42347


# Export Data

In [11]:
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))