## no profiler + conc kernel

In [38]:
import pandas as pd
import numpy as np
import os
from hta.trace_analysis import TraceAnalysis

# Pandas display settings: do not truncate rows or columns, and allow wide
# output / long cell text when a frame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

# One CSV per activity type, all stored in the same result directory.
path = "trans_to_csv/trace_no_profiler_con_res"
type_name = ["conc_kernel", "cuda_runtime", "cuda_driver", "gpu_memcpy", "gpu_memset"]
file_path = [os.path.join(path, f"{stem}.csv") for stem in type_name]
kernel, runtime, driver, memcpy, memset = file_path

# Load the five tables in the same order as `type_name`.
df_kernel, df_runtime, df_driver, df_memcpy, df_memset = (
    pd.read_csv(csv_file) for csv_file in file_path
)

# Share of each table's summed 'dur' in the summed 'dur' of all five tables.
(
    kernel_dur_sum,
    runtime_dur_sum,
    driver_dur_sum,
    memcpy_dur_sum,
    memset_dur_sum,
) = (
    frame['dur'].sum()
    for frame in (df_kernel, df_runtime, df_driver, df_memcpy, df_memset)
)
total_dur = (
    kernel_dur_sum
    + runtime_dur_sum
    + driver_dur_sum
    + memcpy_dur_sum
    + memset_dur_sum
)

kernel_dur_sum_ratio = kernel_dur_sum / total_dur
runtime_dur_sum_ratio = runtime_dur_sum / total_dur
driver_dur_sum_ratio = driver_dur_sum / total_dur
memcpy_dur_sum_ratio = memcpy_dur_sum / total_dur
memset_dur_sum_ratio = memset_dur_sum / total_dur

# Printed as percentages (ratio * 100).
print("kernel_dur_sum_ratio:", kernel_dur_sum_ratio * 100)
print("runtime_dur_sum_ratio:", runtime_dur_sum_ratio * 100)
print("driver_dur_sum_ratio:", driver_dur_sum_ratio * 100)
print("memcpy_dur_sum_ratio:", memcpy_dur_sum_ratio * 100)
print("memset_dur_sum_ratio:", memset_dur_sum_ratio * 100)



kernel_dur_sum_ratio: 1.461571891417708
runtime_dur_sum_ratio: 83.6685563709842
driver_dur_sum_ratio: 14.72396493971079
memcpy_dur_sum_ratio: 0.14141302106863116
memset_dur_sum_ratio: 0.004493776818667804


In [31]:
def time_analysis(df):
    """Print how strongly the largest 'dur' values dominate ``df``.

    Rows are ranked by 'dur' in descending order and the top 1%, 5% and 10%
    of rows are taken, with each row count truncated by ``int``. Printed, in
    order: the row count, the max and min 'dur', the fraction of the total
    'dur' held by each top slice, and the mean 'dur' of each top slice.
    Nothing is returned.

    Note: for small frames ``int(len(df) * fraction)`` can be 0 (for example
    fewer than 100 rows for the 1% slice). That slice is then empty and its
    mean is NaN.
    """
    row_count = len(df)
    ranked = df.sort_values(by='dur', ascending=False)

    # 'dur' values of the top slices, keyed by the label used in the output.
    top_dur = {
        label: ranked.head(int(row_count * fraction))['dur']
        for label, fraction in (("p1", 0.01), ("p5", 0.05), ("p10", 0.1))
    }

    slice_sums = {label: values.sum() for label, values in top_dur.items()}
    total_sum = df['dur'].sum()
    ratios = {label: part / total_sum for label, part in slice_sums.items()}

    print("num:", row_count)
    print("max:", df['dur'].max())
    print("min:", df['dur'].min())
    for label in ("p1", "p5", "p10"):
        print(f"{label}_ratio:", ratios[label])

    means = {label: values.mean() for label, values in top_dur.items()}
    print("p1_mean:", means["p1"])
    print("p5_mean:", means["p5"])
    # The trailing "\n" argument leaves a blank line after each report.
    print("p10_mean:", means["p10"], "\n")

def compare_analysis(df1, df2):
    """Print df1/df2 ratios of the mean 'dur' over each frame's top rows.

    For each frame the rows are ranked by 'dur' in descending order and the
    mean 'dur' of the top 1%, 5% and 10% of rows is computed, with each row
    count truncated by ``int``. The three df1/df2 quotients are printed under
    a "df1/df2" header. Nothing is returned.
    """
    def top_slice_means(frame):
        # Mean 'dur' of the top 1%, 5% and 10% rows of `frame`, in that order.
        ranked = frame.sort_values(by='dur', ascending=False)
        size = len(frame)
        return [
            ranked.head(int(size * fraction))['dur'].mean()
            for fraction in (0.01, 0.05, 0.1)
        ]

    means_df1 = top_slice_means(df1)
    means_df2 = top_slice_means(df2)
    p1_ratio_mean, p5_ratio_mean, p10_ratio_mean = (
        numerator / denominator
        for numerator, denominator in zip(means_df1, means_df2)
    )

    print("df1/df2")
    print("p1_ratio:", p1_ratio_mean)
    print("p5_ratio:", p5_ratio_mean)
    # The trailing "\n" argument leaves a blank line after the report.
    print("p10_ratio:", p10_ratio_mean, "\n")


In [32]:
# Print the 'dur' concentration report for the runtime, kernel and driver
# tables, in that order.
for activity_df in (df_runtime, df_kernel, df_driver):
    time_analysis(activity_df)

# Not run in this cell:
# time_analysis(df_memcpy)
# time_analysis(df_memset)
# compare_analysis(df_runtime, df_kernel)

num: 2162
max: 41915.095
min: 0.173
p1_ratio: 0.9349850006405536
p5_ratio: 0.9814047085227136
p10_ratio: 0.9862139299182633
p1_mean: 7188.783333333334
p5_mean: 1467.2172592592594
p10_mean: 737.2035648148149 

num: 105
max: 691.072
min: 2.24
p1_ratio: 0.24501650764116584
p5_ratio: 0.40397771752079054
p10_ratio: 0.5405773845316028
p1_mean: 691.072
p5_mean: 227.88479999999998
p10_mean: 152.4705 

num: 2967
max: 2746.107
min: 0.157
p1_ratio: 0.9636570493338655
p5_ratio: 0.9714626351063212
p10_ratio: 0.9745723333463903
p1_mean: 944.184896551724
p5_mean: 186.50777027027024
p10_mean: 93.55239527027027 



In [33]:
# Relationship between cudaLaunchKernel* runtime calls and the kernels they launch.

# Distinct runtime API names that contain 'cudaLaunchKernel', in order of
# first appearance in df_runtime.
launch_name = df_runtime[df_runtime['name'].str.contains('cudaLaunchKernel')]['name'].unique().tolist()
print(launch_name)

# Keep only the runtime records whose correlation id also appears on a kernel
# record. (The df_kernel filter keeps every row, since the ids come from
# df_kernel itself.)
correlation_list = df_kernel['correlation'].unique().tolist()
df_runtime = df_runtime[df_runtime['correlation'].isin(correlation_list)]
df_kernel = df_kernel[df_kernel['correlation'].isin(correlation_list)]

# Earliest remaining runtime start; every timestamp below is stored relative to it.
time_stamp = min(df_runtime['start'])

# Kernel names split by which launch API started them (see the loop below).
special_kernel = []
normal_kernel = []

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame with pd.concat inside the loop copies all previous
# rows on every iteration and starts from an empty frame, whose dtype handling
# in concat is deprecated in recent pandas.
compare_rows = []
for correlation in correlation_list:
    df_runtime_temp = df_runtime[df_runtime['correlation'] == correlation]
    df_kernel_temp = df_kernel[df_kernel['correlation'] == correlation]
    # Each correlation id is expected to match exactly one launch and one kernel.
    assert len(df_runtime_temp) == 1, (
        f"correlation {correlation}: expected 1 runtime record, found {len(df_runtime_temp)}"
    )
    assert len(df_kernel_temp) == 1, (
        f"correlation {correlation}: expected 1 kernel record, found {len(df_kernel_temp)}"
    )
    runtime_st = df_runtime_temp['start'].values[0] - time_stamp
    runtime_ed = df_runtime_temp['end'].values[0] - time_stamp
    kernel_st = df_kernel_temp['start'].values[0] - time_stamp
    kernel_ed = df_kernel_temp['end'].values[0] - time_stamp
    # launch_name[1] is the second distinct launch API name; in the recorded
    # run that is 'cudaLaunchKernelExC_v11060'. This relies on the order in
    # which the names first appear in the runtime table.
    if df_runtime_temp['name'].values[0] == launch_name[1]:
        special_kernel.append(df_kernel_temp['name'].values[0])
    else:
        normal_kernel.append(df_kernel_temp['name'].values[0])
    compare_rows.append({
        'correlation': correlation,
        'runtime_st': runtime_st,
        'runtime_ed': runtime_ed,
        'kernel_st': kernel_st,
        'kernel_ed': kernel_ed,
    })

# One row per correlation id, in the order of correlation_list.
df_compare = pd.DataFrame(
    compare_rows,
    columns=['correlation', 'runtime_st', 'runtime_ed', 'kernel_st', 'kernel_ed'],
)



['cudaLaunchKernel', 'cudaLaunchKernelExC_v11060']


In [34]:
ans_1 = []
overlap_kernel = []
# Compare each row with the next one: if row i's kernel_ed is later than row
# i+1's runtime_st, the kernel was still running when the next launch call
# started. Both correlation ids are recorded in ans_1.
for i in range(len(df_compare) - 1):
    current_row = df_compare.iloc[i]
    next_row = df_compare.iloc[i + 1]
    if current_row['kernel_ed'] > next_row['runtime_st']:
        ans_1.extend((current_row['correlation'], next_row['correlation']))
#df_compare[df_compare['correlation'].isin(ans_1)]


# Record the names of the kernels involved in an overlap
# for i in ans_1:
#     overlap_kernel.append(df_kernel[df_kernel['correlation']==i]['name'].values[0])
# print('kernel_launch overlap:',len(ans_1)/2)
# print('overlap kernel name:')
# for i in overlap_kernel:
#     print(i)

# .copy() makes `df` an independent frame, so adding columns below does not
# write into a slice of df_compare (which triggers SettingWithCopyWarning).
df = df_compare[df_compare['correlation'].isin(ans_1)].copy()
# Attach the kernel name and the launching runtime API name for each id.
df['kernel_name'] = [df_kernel[df_kernel['correlation'] == i]['name'].values[0] for i in df['correlation']]
df['runtime_name'] = [df_runtime[df_runtime['correlation'] == i]['name'].values[0] for i in df['correlation']]
df


# # Check whether consecutive runtime calls overlap each other
# # (the printed label used to say 'kernel overlap'; the labels of this
# # snippet and the next one were swapped)
# ans_2=[]
# for i in range(len(df_compare)-1):
#     if df_compare.iloc[i]['runtime_ed']>df_compare.iloc[i+1]['runtime_st']:
#         ans_2.append(df_compare.iloc[i]['correlation'])
#         ans_2.append(df_compare.iloc[i+1]['correlation'])
# print('runtime overlap:',len(ans_2))

# # Check whether consecutive kernels overlap each other
# ans_3=[]
# for i in range(len(df_compare)-1):
#     if df_compare.iloc[i]['kernel_ed']>df_compare.iloc[i+1]['kernel_st']:
#         ans_3.append(df_compare.iloc[i]['correlation'])
#         ans_3.append(df_compare.iloc[i+1]['correlation'])
# print('kernel overlap:',len(ans_3))


Unnamed: 0,correlation,runtime_st,runtime_ed,kernel_st,kernel_ed,kernel_name,runtime_name
7,2098,97035.25,97078.25,97079.0,97219.75,_5x_cudnn_ampere_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1,cudaLaunchKernel
8,2104,97168.5,97181.25,97220.5,97266.5,_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_,cudaLaunchKernel
9,2109,97262.5,97271.0,97272.0,97274.75,_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_21CUDAFunctorOnSelf_addIlEENS_6detail5ArrayIPcLi2EEEEEviT0_T1_,cudaLaunchKernel
20,2436,109446.5,109466.25,109466.0,109487.75,_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_,cudaLaunchKernel
21,2438,109468.5,109475.0,109488.5,109494.25,_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_,cudaLaunchKernel
22,2442,109496.5,111098.75,111098.25,111197.0,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,cudaLaunchKernelExC_v11060
23,2449,111184.5,111195.5,111197.75,111218.0,_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_,cudaLaunchKernel
28,2608,113579.25,113597.25,113597.0,113602.5,_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_,cudaLaunchKernel
29,2610,113599.75,113606.0,113607.5,113615.25,_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_,cudaLaunchKernel
30,2613,113614.25,113621.0,113622.25,113674.25,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,cudaLaunchKernelExC_v11060


In [35]:
# Kernels started through cudaLaunchKernelExC_v11060.
# launch_name[1] is the second distinct launch API name found in the runtime table.
# NOTE(review): only the last expression of a notebook cell is rendered, so the
# runtime-side selection below is evaluated and discarded. Confirm whether it
# was meant to be displayed as well.
df_runtime.loc[df_runtime['name'] == launch_name[1]]
df_kernel.loc[df_kernel['name'].isin(special_kernel)]

Unnamed: 0,start,device,context,block,name,stream,shared memory,grid,end,cat,correlation,dur
15,1732611000000000.0,0,1,25611,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize256x128x32_stage2_warpsize4x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,7,98304,1491,1732611000000000.0,conc_kernel,2278,60.512
22,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,7,49152,1982,1732611000000000.0,conc_kernel,2442,98.688
30,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,7,49152,2251,1732611000000000.0,conc_kernel,2613,52.128
37,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,7,49152,2258,1732611000000000.0,conc_kernel,2775,81.952
44,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,7,49152,2258,1732611000000000.0,conc_kernel,2870,81.12
77,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x64x32_stage4_warpsize2x2x1_g1_tensor16x8x8_execute_kernel__5x_cudnn,7,100352,824,1732611000000000.0,conc_kernel,4123,38.304
85,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x64x32_stage4_warpsize2x2x1_g1_tensor16x8x8_execute_kernel__5x_cudnn,7,100352,824,1732611000000000.0,conc_kernel,4222,37.952
93,1732611000000000.0,0,1,12811,sm86_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x64x32_stage4_warpsize2x2x1_g1_tensor16x8x8_execute_kernel__5x_cudnn,7,100352,824,1732611000000000.0,conc_kernel,4325,38.016


In [36]:
# Kernel analysis with HTA (hta.trace_analysis.TraceAnalysis).

import warnings

# Trace directory and the trace file used for rank 0.
log_path = "./log"
rank_log_dict = {0: 'trace_con.json'}

# Warnings are suppressed only while HTA loads and analyses the trace. A bare
# warnings.filterwarnings("ignore") at cell level would silence every warning
# for the rest of the session, including unrelated pandas deprecation notices.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    analyzer = TraceAnalysis(trace_dir=log_path, trace_files=rank_log_dict)
    #temporal_breakdown_df = analyzer.get_temporal_breakdown(visualize=False)
    kernel_type_metrics_df, kernel_breakdown_df = analyzer.get_gpu_kernel_breakdown(visualize=False, num_kernels=200)

# Kept as the cell's last expression so the notebook renders the table,
# ordered by 'sum (us)' from largest to smallest with a fresh 0..n-1 index.
kernel_breakdown_df.sort_values(by='sum (us)', ascending=False).reset_index(drop=True)



2024-12-03 17:27:05,026 - hta - trace.py:L389 - INFO - /data/zkx/cupti_to_csv/log
2024-12-03 17:27:05,030 - hta - trace.py:L535 - INFO - ranks=[0]
2024-12-03 17:27:05,047 - hta - trace.py:L118 - INFO - Parsed /data/zkx/cupti_to_csv/log/trace_con.json time = 0.02 seconds 


Unnamed: 0,name,sum (us),max (us),min (us),stddev,mean (us),kernel_type,rank
0,_ZN8internal5gemvx6kernelIiiffffLb0ELb1ELb1ELb0ELi7ELb0E16cublasGemvParamsI30cublasGemvTensorStridedBatchedIKfES5_S3_IfEfEEENSt9enable_ifIXntT5_EvE4typeET11_,722.112,691.072,31.04,466.713103,361.056,COMPUTATION,0
1,sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn,313.888,98.688,52.128,19.337488,78.472,COMPUTATION,0
2,_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_,304.607,35.104,2.848,12.397366,13.845773,COMPUTATION,0
3,Memcpy DtoD,186.976,47.072,2.432,16.214812,14.382769,MEMORY,0
4,_ZN17cutlass__5x_cudnn6KernelI66cutlass_tensorop_s1688fprop_optimized_tf32_128x64_32x3_nhwc_align4EEvNT_6ParamsE,185.696,92.96,92.736,0.158392,92.848,COMPUTATION,0
5,_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_49_GLOBAL__N__d2ba64fb_16_TensorCompare_cu_71e06f4e19launch_clamp_scalarERNS_18TensorIteratorBaseEN3c106ScalarES6_NS0_6detail11ClampLimitsEENKUlvE_clEvENKUlvE5_clEvEUlfE_NS_6detail5ArrayIPcLi2EEEEEviT0_T1_,180.927,47.328,2.463,16.437468,13.917462,COMPUTATION,0
6,_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_,177.344,46.112,3.488,15.144885,13.641846,COMPUTATION,0
7,_ZN5cudnn25bn_fw_tr_1C11_kernel_NCHWIffiLi512ELb1ELi1ELb1EEEv17cudnnTensorStructPKT_S1_PS2_PKT0_S8_S6_S6_PS6_S9_S9_S9_S6_S6_,175.265,65.216,22.048,24.169716,43.81625,COMPUTATION,0
8,_5x_cudnn_ampere_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1,140.736,140.736,140.736,0.0,140.736,COMPUTATION,0
9,_Z17gemv2T_kernel_valIiiffffLi128ELi16ELi4ELi4ELb0ELb1E16cublasGemvParamsI30cublasGemvTensorStridedBatchedIKfES3_S1_IfEfEEvT11_T4_S7_,115.968,115.968,115.968,0.0,115.968,COMPUTATION,0


## no profiler + kernel

In [37]:
import pandas as pd
import numpy as np
import os

# Load the kernel and runtime tables of the "no profiler + kernel" run.
path = "trans_to_csv/trace_no_profiler_res"
type_name = ["kernel", "cuda_runtime"]
file_path = [os.path.join(path, for_type) + '.csv' for for_type in type_name]
runtime = file_path[1]
kernel = file_path[0]

df_runtime = pd.read_csv(runtime)
df_kernel = pd.read_csv(kernel)

# Distinct runtime API names that contain 'cudaLaunchKernel'.
launch_name = df_runtime[df_runtime['name'].str.contains('cudaLaunchKernel')]['name'].unique().tolist()

# Keep only the runtime records whose correlation id also appears on a kernel
# record. (The df_kernel filter keeps every row, since the ids come from
# df_kernel itself.)
correlation_list = df_kernel['correlation'].unique().tolist()
df_runtime = df_runtime[df_runtime['correlation'].isin(correlation_list)]
df_kernel = df_kernel[df_kernel['correlation'].isin(correlation_list)]

# Earliest remaining runtime start; every timestamp below is stored relative to it.
time_stamp = min(df_runtime['start'])

# Rows are collected in a list and the DataFrame is built once at the end,
# instead of re-concatenating a growing frame on every iteration.
compare_rows = []
for correlation in correlation_list:
    df_runtime_temp = df_runtime[df_runtime['correlation'] == correlation]
    df_kernel_temp = df_kernel[df_kernel['correlation'] == correlation]
    # Each correlation id is expected to match exactly one launch and one kernel.
    assert len(df_runtime_temp) == 1, (
        f"correlation {correlation}: expected 1 runtime record, found {len(df_runtime_temp)}"
    )
    assert len(df_kernel_temp) == 1, (
        f"correlation {correlation}: expected 1 kernel record, found {len(df_kernel_temp)}"
    )
    runtime_st = df_runtime_temp['start'].values[0] - time_stamp
    runtime_ed = df_runtime_temp['end'].values[0] - time_stamp
    kernel_st = df_kernel_temp['start'].values[0] - time_stamp
    # kernel_ed must be rebased by time_stamp like the other three columns.
    # Left as an absolute timestamp it cannot be compared with runtime_st or
    # kernel_st, and every overlap check against it would be wrong.
    kernel_ed = df_kernel_temp['end'].values[0] - time_stamp
    compare_rows.append({
        'correlation': correlation,
        'runtime_st': runtime_st,
        'runtime_ed': runtime_ed,
        'kernel_st': kernel_st,
        'kernel_ed': kernel_ed,
    })

# One row per correlation id, in the order of correlation_list.
df_compare = pd.DataFrame(
    compare_rows,
    columns=['correlation', 'runtime_st', 'runtime_ed', 'kernel_st', 'kernel_ed'],
)



In [2]:
import pandas as pd
# Kernel-name tables from three sources. The first file stores the names in a
# 'kernel' column, the other two in a 'name' column.
df_1 = pd.read_csv('unique_kernels.csv')
df_2 = pd.read_csv('unique_kernels_1.csv')
df_3 = pd.read_csv('conc_kernel.csv')

# Distinct names per source, in order of first appearance.
name1 = df_1['kernel'].drop_duplicates().tolist()
name2 = df_2['name'].drop_duplicates().tolist()
name3 = df_3['name'].drop_duplicates().tolist()

# set1: kernels seen in the profiler           (original note: -3)
# set2: kernels seen in trace_profiler         (original note: -3)
# set3: kernels seen in trace_no_profiler      (original note: 20)
# NOTE(review): the "-3" / "20" annotations are carried over unchanged and
# their meaning is not established by this cell. Confirm with the author.
set1, set2, set3 = map(set, (name1, name2, name3))

In [4]:
# Kernel names present in set3 (trace_no_profiler) but absent from set1 (profiler).
set3.difference(set1)

{'sm86_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x64x32_stage4_warpsize2x2x1_g1_tensor16x8x8_execute_kernel__5x_cudnn',
 'sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage3_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn',
 'sm86_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize256x128x32_stage2_warpsize4x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn'}

In [5]:
# Kernel names present in set3 (trace_no_profiler) but absent from set2 (trace_profiler).
set3.difference(set2)

{'_Z17gemv2T_kernel_valIiiffffLi128ELi16ELi4ELi4ELb0ELb1E16cublasGemvParamsI30cublasGemvTensorStridedBatchedIKfES3_S1_IfEfEEvT11_T4_S7_',
 '_ZN2at6native43_GLOBAL__N__50f1b6c4_10_Dropout_cu_0e96ed3824fused_dropout_kernel_vecIffjLi1ELi4EbEEvNS_4cuda6detail10TensorInfoIKT_T1_EENS5_IS6_S8_EENS5_IT4_S8_EES8_T0_NS_15PhiloxCudaStateE',
 '_ZN8internal5gemvx6kernelIiiffffLb0ELb1ELb1ELb0ELi7ELb0E16cublasGemvParamsI30cublasGemvTensorStridedBatchedIKfES5_S3_IfEfEEENSt9enable_ifIXntT5_EvE4typeET11_'}