In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/11.0.167"
homedir = os.path.dirname(os.getcwd())

In [3]:
#input and output dirs
datadirs = ["/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla"]
outputdir = "/global/cfs/cdirs/m1759/yswang/results/rnn1d-tf2-noxla"

# Functions

In [4]:
def transpose_frame(df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["ID", "Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass", "Name"]
    resultkeys = ["Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass", "Name"]
                                    
    #as metricdf use df_summary
    metricdf = df_metrics.copy()
    #metricdf.sort_values(by=selectkeys,inplace=True)
    #metricdf.reset_index(drop=True, inplace=True)

    #remove the calibration
    metricdf = metricdf[metricdf["Pass"] != "calibrate"]
    profiledf = pd.DataFrame(columns=selectkeys)
    
    ####### Get timing information
    ### CUDA Time
    # get cycles
    metricname = "CUDA Cycles"
    cyclesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="total"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # get rates
    metricname = "CUDA Rates"
    ratesdf = metricdf.loc[(metricdf["Metric Name"]=="sm__cycles_elapsed") & (metricdf["Metric Type"]=="rate"),
                           selectkeys+["Metric Unit", "Metric Value"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={"Metric Value": metricname}).copy()
    # check consistency
    if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):
        raise ValueError("CUDA Time data not consistent")
    # adjust metric unit
    ratesdf.loc[ratesdf["Metric Unit"].str.contains("cycle/nsecond"), ["CUDA Rates"]] *= 1e9
    # manual merge and compute CUDA Time
    cyclesdf["CUDA Rates"] = list(ratesdf["CUDA Rates"])
    cyclesdf["CUDA Time"] = cyclesdf["CUDA Cycles"] / cyclesdf["CUDA Rates"]
    # merge with output
    profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()
    
    ### Combine
    del profiledf['ID']
    del metricdf['ID']
    profiledf['Invocations'] = 1
    profiledf = profiledf.groupby(resultkeys).sum().reset_index()
        
    ####### Get number of FLOPs
    
    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2
    
    ### FP32 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_fadd_pred_on',
               'sm__sass_thread_inst_executed_op_ffma_pred_on',
               'sm__sass_thread_inst_executed_op_fmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP32 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP32 FLOPs"]], on=resultkeys, how="inner")
    
    ### FP16 FLOPs
    metrics = ['sm__sass_thread_inst_executed_op_hadd_pred_on',
               'sm__sass_thread_inst_executed_op_hfma_pred_on',
               'sm__sass_thread_inst_executed_op_hmul_pred_on']
    tmpdf = metricdf.loc[ metricdf["Metric Name"].isin(metrics), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "FP16 FLOPs"})
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["FP16 FLOPs"]], on=resultkeys, how="inner")
    
    ### TC FLOPs
    tmpdf = metricdf.loc[ metricdf["Metric Name"].str.contains("sm__inst_executed_pipe_tensor"), resultkeys+["Metric Value"] ].copy()
    tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "TC FLOPs"})
    tmpdf["TC FLOPs"] = 512 * tmpdf["TC FLOPs"]
    # merge
    profiledf = profiledf.merge(tmpdf[resultkeys+["TC FLOPs"]], on=resultkeys, how="inner")

    ### Total FLOPs
    profiledf["FLOPs"] = profiledf["FP32 FLOPs"] + profiledf["FP16 FLOPs"] + profiledf["TC FLOPs"] #+ metricdf["FP64 FLOPs"]
        
    ### FLOPs fractions
    #profiledf["FP64 FLOPs Fraction"] = profiledf["FP64 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP32 FLOPs Fraction"] = profiledf["FP32 FLOPs"]/profiledf["FLOPs"]
    profiledf["FP16 FLOPs Fraction"] = profiledf["FP16 FLOPs"]/profiledf["FLOPs"]
    profiledf["TC FLOPs Fraction"]   = profiledf["TC FLOPs"]/profiledf["FLOPs"]
    profiledf = profiledf.fillna(0.)

    ####### Get number of bytes
    
    # adjust metric unit
    metricdf.loc[(metricdf["Metric Unit"]=="Kbyte"), ["Metric Value"]] *= 1e3
    metricdf.loc[(metricdf["Metric Unit"]=="Mbyte"), ["Metric Value"]] *= 1e6
    metricdf.loc[(metricdf["Metric Unit"]=="Gbyte"), ["Metric Value"]] *= 1e9
    
    ### L1 Bytes
    #project out
    l1df = metricdf.loc[metricdf["Metric Name"].str.contains("l1tex__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l1df = l1df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L1 Bytes"})
    # merge
    profiledf = profiledf.merge(l1df[resultkeys+["L1 Bytes"]], on=resultkeys, how="inner")
    
    ### L2 Bytes
    #project out
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_bytes"), selectkeys+["Metric Value"] ].copy()
    l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Bytes"})
    # merge
    profiledf = profiledf.merge(l2df[resultkeys+["L2 Bytes"]], on=resultkeys, how="inner")
    
    ### DRAM Bytes
    #project out
    dramdf = metricdf.loc[metricdf["Metric Name"].str.contains("dram__bytes"), selectkeys+["Metric Value"] ].copy()
    dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={"Metric Value": "DRAM Bytes"})
    # merge
    profiledf = profiledf.merge(dramdf[resultkeys+["DRAM Bytes"]], on=resultkeys, how="inner")
    
    ### Get performance
    profiledf["Performance GFlop/s"] = profiledf["FLOPs"] / (profiledf["CUDA Time"]*10**9)

    ### Get AI
    # L1
    profiledf["L1 AI"]        = profiledf["FLOPs"]      / profiledf["L1 Bytes"]
    profiledf["FP32 L1 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["FP16 L1 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L1 Bytes"]
    profiledf["TC L1 AI"]     = profiledf["TC FLOPs"]   / profiledf["L1 Bytes"]
    # L2
    profiledf["L2 AI"]        = profiledf["FLOPs"]      / profiledf["L2 Bytes"]
    profiledf["FP32 L2 AI"]   = profiledf["FP32 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["FP16 L2 AI"]   = profiledf["FP16 FLOPs"] / profiledf["L2 Bytes"]
    profiledf["TC L2 AI"]     = profiledf["TC FLOPs"]   / profiledf["L2 Bytes"]
    # DRAM
    profiledf["DRAM AI"]      = profiledf["FLOPs"]      / profiledf["DRAM Bytes"]
    profiledf["FP32 DRAM AI"] = profiledf["FP32 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["FP16 DRAM AI"] = profiledf["FP16 FLOPs"] / profiledf["DRAM Bytes"]
    profiledf["TC DRAM AI"]   = profiledf["TC FLOPs"]   / profiledf["DRAM Bytes"]
    
    ### Cleanup
    profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)

    return profiledf

# Import Data

In [5]:
#get all the files
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".ncu-rep"))]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-1])
    
    #append to records
    records.append({"prefix": prefix, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#display(recorddf["prefix"])

In [6]:
#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
xla_list = set([x.split(".")[-1] for x in recorddf["prefix"]])
if (len(xla_list) != 1):
    raise RuntimeError("too many xla options.")
xla = xla_list.pop()

all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])
all_passes = set([x.split(".")[0] for x in all_passes])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    df_times = []
    df_metrics = []
    for pas in all_passes:
        if pas == 'calibrate':
            continue
        
        #project frame
        files = recorddf.loc[ recorddf["prefix"] == pref + ".pass_" + pas + "." + xla, "file" ].values
        
        #project the invididual files
        metricfile = [x for x in files if x.endswith(".ncu-rep")][0]
        print(metricfile)
            
        #get the parameters from the filename
        parameters = parse_filename_nsight(os.path.basename(metricfile))
        #print(parameters)
            
        #metrics
        metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)
        for key in parameters:
            metricdf[key] = parameters[key]

        #fuse read/write metrics together:
        unique_metrics = metricdf["Metric Name"].unique()
        
        unique_metrics = set([x.replace(".sum","").replace(".per_second","").replace(".avg","") for x in unique_metrics])
        #add the metric type
        metricdf["Metric Type"] = "total"
        #rate
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second"), "Metric Type" ] = "rate"
                
        for metric in unique_metrics:
            metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric
                
        #append to DF:
        df_metrics.append(metricdf)
    
    metricdf = pd.concat(df_metrics)
    
    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)
profiledf.reset_index(drop=True, inplace=True)

/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x64x64.nneu_64.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x64x64.nneu_64.pass_forward.noxla.ncu-rep


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x32.nneu_32.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x32.nneu_32.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x128.nneu_128.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x128.nneu_128.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x128.nneu_64.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x32x128.nneu_64.pass_forward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x128x64.nneu_128.pass_backward.noxla.ncu-rep
/global/cfs/cdirs/m1759/yswang/data/rnn1d-tf2-noxla/tf2.fp_16.celltype_lstm.input_32x128x64.nneu_128.pass_

In [7]:
display(profiledf)

Unnamed: 0,Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Name,CUDA Time,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
0,FP16,lstm,32x64x64,32,64,64,64,backward,_ZN10tensorflow87_GLOBAL__N__63_tmpxft_000027e...,0.000403,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
1,FP16,lstm,32x64x64,32,64,64,64,backward,_ZN10tensorflow93_GLOBAL__N__69_tmpxft_0000243...,0.000034,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
2,FP16,lstm,32x64x64,32,64,64,64,backward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.000006,...,0.010417,0.0,0.000287,0.000000,0.000287,0.0,0.000372,0.000000,0.000372,0.0
3,FP16,lstm,32x64x64,32,64,64,64,backward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.001865,...,0.083320,0.0,0.058603,0.000000,0.058603,0.0,0.095226,0.000000,0.095226,0.0
4,FP16,lstm,32x64x64,32,64,64,64,backward,void Eigen::internal::EigenMetaKernel<Eigen::T...,0.000625,...,0.249959,0.0,0.175777,0.000000,0.175777,0.0,0.285536,0.000000,0.285536,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038,FP32,lstm,64x32x32,64,32,32,16,forward,void tensorflow::functor::FillPhiloxRandomKern...,0.000009,...,0.000000,0.0,7.564324,7.564324,0.000000,0.0,87.454472,87.454472,0.000000,0.0
2039,FP32,lstm,64x32x32,64,32,32,16,forward,void tensorflow::functor::FillPhiloxRandomKern...,0.000014,...,0.000000,0.0,0.247008,0.247008,0.000000,0.0,2.623143,2.623143,0.000000,0.0
2040,FP32,lstm,64x32x32,64,32,32,16,forward,void tensorflow::functor::SwapDimension1And2In...,0.000006,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
2041,FP32,lstm,64x32x32,64,32,32,16,forward,volta_sgemm_32x128_nn,0.000289,...,0.000000,0.0,5.220857,5.220857,0.000000,0.0,13.920494,13.920494,0.000000,0.0


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Cell", "Input Shape", "Batch Size", "Time Steps", "Features", "Hidden Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()


#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction"] = combineddf["FP32 FLOPs"] / combineddf["FLOPs"]
combineddf["FP16 FLOPs Fraction"] = combineddf["FP16 FLOPs"] / combineddf["FLOPs"]
combineddf["TC FLOPs Fraction"]   = combineddf["TC FLOPs"]   / combineddf["FLOPs"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs"]      / (combineddf["CUDA Time"]*10**9)


### Get AI
# L1
combineddf["L1 AI"]        = combineddf["FLOPs"]      / combineddf["L1 Bytes"]
combineddf["FP32 L1 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L1 Bytes"]
combineddf["FP16 L1 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L1 Bytes"]
combineddf["TC L1 AI"]     = combineddf["TC FLOPs"]   / combineddf["L1 Bytes"]
# L2
combineddf["L2 AI"]        = combineddf["FLOPs"]      / combineddf["L2 Bytes"]
combineddf["FP32 L2 AI"]   = combineddf["FP32 FLOPs"] / combineddf["L2 Bytes"]
combineddf["FP16 L2 AI"]   = combineddf["FP16 FLOPs"] / combineddf["L2 Bytes"]
combineddf["TC L2 AI"]     = combineddf["TC FLOPs"]   / combineddf["L2 Bytes"]
# DRAM
combineddf["DRAM AI"]      = combineddf["FLOPs"]      / combineddf["DRAM Bytes"]
combineddf["FP32 DRAM AI"] = combineddf["FP32 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["FP16 DRAM AI"] = combineddf["FP16 FLOPs"] / combineddf["DRAM Bytes"]
combineddf["TC DRAM AI"]   = combineddf["TC FLOPs"]   / combineddf["DRAM Bytes"]

combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)

In [9]:
display(combineddf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,CUDA Time,Invocations,FP32 FLOPs,FP16 FLOPs,TC FLOPs,FLOPs,FP32 FLOPs Fraction,FP16 FLOPs Fraction,TC FLOPs Fraction,L1 Bytes,...,FP16 L1 AI,TC L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI
Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,lstm,128x32x32,128,32,32,16,backward,0.005462,1192,28344820.0,4792128.0,167772160.0,200909108.0,0.141083,0.023852,0.835065,70376322.0,...,0.068093,2.383929,2.524666,0.356187,0.060219,2.108260,7.210730,1.017310,0.171992,6.021428
FP16,lstm,128x32x32,128,32,32,16,forward,0.002112,488,25505693.0,3674114.0,34603008.0,63782815.0,0.399883,0.057604,0.542513,48170302.0,...,0.076273,0.718347,1.241076,0.496286,0.071490,0.673300,6.330072,2.531291,0.364634,3.434146
FP16,lstm,256x32x32,256,32,32,16,backward,0.005993,1256,36827054.0,8739371.0,352321536.0,397887961.0,0.092556,0.021964,0.885479,127518082.0,...,0.068534,2.762914,2.756015,0.255087,0.060534,2.440394,7.995908,0.740072,0.175625,7.080210
FP16,lstm,256x32x32,256,32,32,16,forward,0.002082,488,30687246.0,6031362.0,69206016.0,105924624.0,0.289708,0.056940,0.653352,88348812.0,...,0.068268,0.783327,1.147712,0.332502,0.065351,0.749860,5.117737,1.482651,0.291405,3.343682
FP16,lstm,32x128x128,32,128,128,16,backward,0.020738,4648,31050434.0,5470663.0,470548480.0,507069577.0,0.061235,0.010789,0.927976,93261752.0,...,0.058659,5.045460,4.209550,0.257772,0.045416,3.906362,8.797738,0.538730,0.094917,8.164091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FP32,lstm,256x32x32,256,32,32,16,forward,0.002358,489,88783264.0,0.0,0.0,88783264.0,1.000000,0.000000,0.000000,112357580.0,...,0.000000,0.000000,0.765648,0.765648,0.000000,0.000000,2.716541,2.716541,0.000000,0.000000
FP32,lstm,32x32x32,32,32,32,16,backward,0.005308,1194,51184182.0,0.0,0.0,51184182.0,1.000000,0.000000,0.000000,30037370.0,...,0.000000,0.000000,1.349313,1.349313,0.000000,0.000000,3.708233,3.708233,0.000000,0.000000
FP32,lstm,32x32x32,32,32,32,16,forward,0.002085,489,32290274.0,0.0,0.0,32290274.0,1.000000,0.000000,0.000000,21028750.0,...,0.000000,0.000000,1.354980,1.354980,0.000000,0.000000,5.381730,5.381730,0.000000,0.000000
FP32,lstm,64x32x32,64,32,32,16,backward,0.005512,1194,79781799.0,0.0,0.0,79781799.0,1.000000,0.000000,0.000000,51723310.0,...,0.000000,0.000000,1.289835,1.289835,0.000000,0.000000,3.568048,3.568048,0.000000,0.000000


In [10]:
display(combineddf[["L2 AI", "L1 AI"]])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Cell,Input Shape,Batch Size,Time Steps,Features,Hidden Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,lstm,128x32x32,128,32,32,16,backward,2.524666,2.854783
FP16,lstm,128x32x32,128,32,32,16,forward,1.241076,1.324111
FP16,lstm,256x32x32,256,32,32,16,backward,2.756015,3.120247
FP16,lstm,256x32x32,256,32,32,16,forward,1.147712,1.198937
FP16,lstm,32x128x128,32,128,128,16,backward,4.209550,5.437058
...,...,...,...,...,...,...,...,...,...
FP32,lstm,256x32x32,256,32,32,16,forward,0.765648,0.790185
FP32,lstm,32x32x32,32,32,32,16,backward,1.349313,1.704017
FP32,lstm,32x32x32,32,32,32,16,forward,1.354980,1.535530
FP32,lstm,64x32x32,64,32,32,16,backward,1.289835,1.542473


# Export Data

In [11]:
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))