-
Notifications
You must be signed in to change notification settings - Fork 0
/
nvprof_time_3.log
148 lines (146 loc) · 23.4 KB
/
nvprof_time_3.log
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
==2677== NVPROF is profiling process 2677, command: /home/lanchuanxin/ProgramFile/anaconda3/envs/tensorflow_gpu/bin/python label_image.py new_test_images_nvprof/
==2677== Profiling application: /home/lanchuanxin/ProgramFile/anaconda3/envs/tensorflow_gpu/bin/python label_image.py new_test_images_nvprof/
==2677== Profiling result:
"Time(%)","Time","Calls","Avg","Min","Max","Name"
%,ms,,ms,ms,ms,
12.524895,139.372860,7332,0.019008,0.004224,0.259401,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_17TensorReshapingOpIKNS_6DSizesIiLi2EEENS_9TensorMapINS_6TensorIfLi4ELi1ElEELi16ENS_11MakePointerEEEEEKNS_19TensorCwiseBinaryOpINS0_13scalar_sum_opIffEEKNSE_INS0_17scalar_product_opIffEEKNSE_INS0_20scalar_difference_opIKfSK_EEKNS4_IS7_KNS8_INS9_ISK_Li4ELi1ElEELi16ESB_EEEEKNS_20TensorBroadcastingOpIKNS_9IndexListIiJNS_10type2indexILl1EEEEEEKNS4_IKNSS_ISU_JiEEEKNS8_INS9_ISK_Li1ELi1ElEELi16ESB_EEEEEEEEKNSR_ISW_KNS4_ISY_KNS_18TensorForcedEvalOpIKNS_18TensorCwiseUnaryOpINS0_15scalar_rsqrt_opIfEEKNSE_INSF_ISK_SK_EES11_KNS_20TensorCwiseNullaryOpINS0_18scalar_constant_opISK_EES11_EEEEEEEEEEEEEES15_EEEENS_9GpuDeviceEEElEEvT_T0_"
9.574521,106.542082,2028,0.052535,0.045570,0.065442,"maxwell_sgemm_128x64_raggedMn_nn_splitK"
8.930946,99.380597,785,0.126599,0.104835,0.461135,"maxwell_scudnn_128x128_relu_small_nn"
7.073354,78.709929,1884,0.041778,0.023073,0.140421,"void fft1d_r2c_32<float, float, float2, bool=1, bool=0>(float2*, float const *, int, int3, int3, int2, int2)"
6.611527,73.570875,1884,0.039050,0.028193,0.152069,"maxwell_gcgemmBatched_64x32_raggedMn_nt"
5.837879,64.961977,4212,0.015423,0.003232,0.097252,"void tensorflow::functor::SwapDimension0And2InTensor3Simple<float>(int, float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension0And2InTensor3Simple<float>*)"
4.774477,53.128796,702,0.075682,0.029153,0.117444,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_18TensorLayoutSwapOpINS_9TensorMapINS_6TensorIfLi4ELi1EiEELi16ENS_11MakePointerEEEEEKNS_17TensorReshapingOpIKNS_6DSizesIiLi4EEEKNS_17TensorReductionOpINS0_18AvgPoolMeanReducerIfEEKNS_9IndexListINS_10type2indexILl1EEEJNSJ_ILl2EEEEEEKNS_18TensorImagePatchOpILln1ELln1EKNS4_IKNS5_INS6_IKfLi4ELi1EiEELi16ES8_EEEEEES8_EEEEEENS_9GpuDeviceEEEiEEvT_T0_"
4.213500,46.886422,942,0.049773,0.021057,0.121124,"maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt"
3.483724,38.765721,8346,0.004644,0.002144,0.031777,"void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<float, int=32, int=8>(float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<float, int=32, int=8>*)"
3.398350,37.815713,1014,0.037293,0.026785,0.049409,"void magma_lds128_sgemm_kernel<bool=0, bool=0, int=6, int=5, int=3, int=3, int=3>(int, int, int, float const *, int, float const *, int, float*, int, int, int, float const *, float const *, float, float, int)"
3.144739,34.993605,244,0.143416,0.113411,0.164197,"maxwell_scudnn_128x64_relu_small_nn"
2.758995,30.701179,172,0.178495,0.059682,0.335275,"void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *)"
1.849381,20.579295,864,0.023818,0.021216,0.031873,"void fft1d_r2c_32<float, float, float2, bool=0, bool=0>(float2*, float const *, int, int3, int3, int2, int2)"
1.816087,20.208809,258,0.078328,0.016320,0.109060,"void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const *, float*, int)"
1.660138,18.473468,7722,0.002392,0.001120,0.019777,"void tensorflow::_GLOBAL__N__71_tmpxft_000022f8_00000000_17_check_numerics_op_gpu_cu_compute_61_cpp1_ii_3f470e8b::CheckNumericsKernel<float>(float const *, int, int*)"
1.620268,18.029801,549,0.032841,0.000704,0.519665,"[CUDA memcpy HtoD]"
1.512907,16.835126,7332,0.002296,0.001408,0.023937,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, long>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const , float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const > const , Eigen::GpuDevice>, long>(float, int=1)"
1.325088,14.745134,1020,0.014456,0.011360,0.021185,"void fft1d_r2c_32<float, float, float2, bool=0, bool=1>(float2*, float const *, int, int3, int3, int2, int2)"
1.235194,13.744831,19,0.723412,0.204198,3.532882,"void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2**, float2**, float2**, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)"
1.161024,12.919485,942,0.013714,0.003296,0.050273,"void cudnn::winograd::generateWinogradTilesKernel<int=0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)"
1.146796,12.761165,9751,0.001308,0.000583,0.020647,"[CUDA memset]"
1.031080,11.473517,7332,0.001564,0.001376,0.008256,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorEvalToOp<Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_rsqrt_op<float>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float const , float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const > const , Eigen::MakePointer> const , Eigen::GpuDevice>, long>(float, Eigen::internal::scalar_rsqrt_op<float>)"
1.022586,11.378995,86,0.132313,0.043937,0.208006,"void cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, float const *, float const *)"
0.952226,10.596055,18,0.588669,0.005952,4.879838,"void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)"
0.899119,10.005100,1020,0.009808,0.009472,0.010272,"void fft1d_c2r_32<float2, float, float, bool=0, bool=1, bool=0, bool=1>(float*, float2 const *, int, int3, int3, int2, int, float, float, float*, float*)"
0.864350,9.618198,864,0.011132,0.010816,0.011808,"void fft1d_c2r_32<float2, float, float, bool=0, bool=1, bool=0, bool=0>(float*, float2 const *, int, int3, int3, int2, int, float, float, float*, float*)"
0.754839,8.399595,159,0.052827,0.022144,0.088547,"maxwell_scudnn_128x32_relu_interior_nn"
0.745335,8.293836,1,8.293836,8.293836,8.293836,"void zeroPad128<float, float, bool=1>(float const *, float*, int, int, int, int, int, int, int, int)"
0.707570,7.873600,390,0.020188,0.007648,0.040257,"void tensorflow::_GLOBAL__N__67_tmpxft_00000a7e_00000000_17_maxpooling_op_gpu_cu_compute_61_cpp1_ii_102aff11::MaxPoolForwardNHWC<float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, tensorflow::_GLOBAL__N__67_tmpxft_00000a7e_00000000_17_maxpooling_op_gpu_cu_compute_61_cpp1_ii_102aff11::MaxPoolForwardNHWC<float>*, __int64*)"
0.682010,7.589176,47,0.161471,0.056738,2.282378,"maxwell_gcgemmBatched_32x32_raggedMn_nt"
0.649203,7.224119,3588,0.002013,0.001248,0.003200,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, unsigned long=2> const , Eigen::array<int, unsigned long=2> const , Eigen::TensorMap<Eigen::Tensor<float, int=2, int=1, int>, int=16, Eigen::MakePointer>>, Eigen::TensorMap<Eigen::Tensor<float const , int=2, int=1, int>, int=16, Eigen::MakePointer> const > const , Eigen::GpuDevice>, int>(int, unsigned long=2)"
0.642698,7.151726,7878,0.000907,0.000640,0.011136,"[CUDA memcpy DtoH]"
0.587090,6.532947,2,3.266473,2.194023,4.338924,"void zeroPad256<float, float, bool=1>(float const *, float*, int, int, int, int, int, int, int, int)"
0.427751,4.759866,2,2.379933,0.037345,4.722521,"void DSE::vector_fft<int=0, int=1, int=128, int=8, int=8, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *)"
0.425413,4.733849,2,2.366924,0.045441,4.688408,"void DSE::regular_fft<int=0, int=1, int=128, int=16, int=32, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *, int, void const *, void const *, int)"
0.364601,4.057156,4,1.014289,0.071810,2.582164,"void DSE::regular_fft<int=0, int=1, int=256, int=16, int=16, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *, int, void const *, void const *, int)"
0.363637,4.046436,11,0.367857,0.026241,0.652790,"void fft2d_r2c_32x32<float, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)"
0.355553,3.956481,4,0.989120,0.057378,2.546195,"void DSE::vector_fft<int=0, int=1, int=256, int=16, int=16, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *)"
0.333996,3.716599,6,0.619433,0.029249,3.185831,"void fft2d_r2c_32x32<float, unsigned int=1, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)"
0.260169,2.895070,16,0.180941,0.050465,0.577715,"void cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, float const *, kernel_conv_params, int, float, float, int, float const *, float const *, int, int)"
0.236486,2.631542,78,0.033737,0.032897,0.036737,"maxwell_sgemm_128x128_raggedMn_nn"
0.223907,2.491568,25,0.099662,0.096387,0.105988,"maxwell_gcgemmBatched_64x64_raggedMn_nt"
0.195734,2.178066,1194,0.001824,0.001216,0.008544,"cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)"
0.173497,1.930622,6,0.321770,0.021824,0.716247,"void fft2d_r2c_16x16<float>(float2*, float const *, int, int, int, int, int, int, int, int)"
0.158195,1.760337,78,0.022568,0.015937,0.023905,"void tensorflow::_GLOBAL__N__72_tmpxft_0000553c_00000000_17_resize_bilinear_op_gpu_cu_compute_61_cpp1_ii_c041e0eb::ResizeBilinearKernel<float>(int, float const *, float, float, int, int, int, int, int, int, float*)"
0.156064,1.736632,7,0.248090,0.124004,0.470960,"void cudnn::detail::implicit_convolve_sgemm<float, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, float const *, kernel_conv_params, int, float, float, int, float const *, float const *, int, int)"
0.142572,1.586489,78,0.020339,0.004128,0.024865,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, long>, int=16, Eigen::MakePointer>, Eigen::TensorConversionOp<float, Eigen::TensorMap<Eigen::Tensor<unsigned char const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, long>(float, int=1)"
0.118075,1.313898,16,0.082118,0.006816,0.292329,"void flip_filter<float, float>(float*, float const *, int, int, int, int)"
0.114578,1.274985,81,0.015740,0.007617,0.030081,"void fft2d_r2c_32x32<float, unsigned int=0, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)"
0.088195,0.981405,6,0.163567,0.013312,0.455246,"void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, int, int, int)"
0.079812,0.888123,71,0.012508,0.008032,0.017697,"void fft2d_c2r_32x32<float, bool=0, unsigned int=0, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*)"
0.078731,0.876096,78,0.011232,0.011041,0.011904,"void tensorflow::functor::SwapDimension1And2InTensor3Simple<float>(int, float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3Simple<float>*)"
0.069779,0.776472,78,0.009954,0.009665,0.010432,"void gemv2N_kernel_val<float, float, float, int=128, int=32, int=4, int=4, int=1>(float, float, cublasGemv2Params_v2<float, float, float>)"
0.051008,0.567603,3,0.189201,0.169765,0.201287,"maxwell_scudnn_128x64_relu_interior_nn"
0.038426,0.427594,78,0.005481,0.005312,0.005665,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_right<float, float, Eigen::internal::scalar_product_op<float, float>>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, int>(float, int=1)"
0.038355,0.426807,78,0.005471,0.005280,0.006113,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_right<float, float, Eigen::internal::scalar_difference_op<float, float>>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, int>(float, int=1)"
0.034570,0.384684,2,0.192342,0.181318,0.203366,"void zeroPad256<float, float, bool=0>(float const *, float*, int, int, int, int, int, int, int, int)"
0.028387,0.315882,1,0.315882,0.315882,0.315882,"maxwell_scudnn_128x128_relu_interior_nn"
0.027265,0.303401,2,0.151700,0.123204,0.180197,"void DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *, int, void const *, void const *, int, int, int, int, int, float, float, bool, int, DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, int=2, float2, float>, DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, int=2, float2, float>)"
0.026248,0.292074,78,0.003744,0.003520,0.004128,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_18TensorCwiseUnaryOpINS0_17scalar_inverse_opIfEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_9IndexListINS_10type2indexILl1EEEJEEEKNS_9TensorMapINS_6TensorIfLi2ELi1ElEELi16ENS_11MakePointerEEESI_EEEESI_EENS_9GpuDeviceEEElEEvT_T0_"
0.023492,0.261415,78,0.003351,0.003200,0.003552,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10MaxReducerIfEEKNS_9IndexListINS_10type2indexILl1EEEJEEEKNS_9TensorMapINS_6TensorIKfLi2ELi1ElEELi16ENS_11MakePointerEEESG_EESG_EENS_9GpuDeviceEEElEEvT_T0_"
0.019625,0.218378,78,0.002799,0.002688,0.003072,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi2ELi1ElEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_13scalar_exp_opIfEEKNS_19TensorCwiseBinaryOpINS0_20scalar_difference_opIKfSE_EEKNS4_INS5_ISE_Li2ELi1ElEELi16ES7_EEKNS_20TensorBroadcastingOpIKNS_9IndexListINS_10type2indexILl1EEEJiEEEKNS_17TensorReshapingOpIKNSK_IiJSM_EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10MaxReducerIfEEKNSK_ISM_JEEESI_S7_EEEEEEEEEEEEEENS_9GpuDeviceEEElEEvT_T0_"
0.019141,0.212999,1,0.212999,0.212999,0.212999,"void cudnn::detail::implicit_convolve_sgemm<float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, float const *, kernel_conv_params, int, float, float, int, float const *, float const *, int, int)"
0.017911,0.199303,78,0.002555,0.002464,0.002720,"_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi2ELi1ElEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_17scalar_product_opIffEEKS8_KNS_20TensorBroadcastingOpIKNS_9IndexListINS_10type2indexILl1EEEJiEEEKNS_17TensorReshapingOpIKNSE_IiJSG_EEEKNS_18TensorForcedEvalOpIKNS_18TensorCwiseUnaryOpINS0_17scalar_inverse_opIfEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNSE_ISG_JEEESC_S7_EEEEEEEEEEEEEENS_9GpuDeviceEEElEEvT_T0_"
0.017249,0.191943,2,0.095971,0.073923,0.118020,"void DSE::vector_fft<int=1, int=2, int=256, int=16, int=16, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *)"
0.016007,0.178118,19,0.009374,0.001664,0.059074,"compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)"
0.015768,0.175458,11,0.015950,0.014976,0.017600,"void fft2d_c2r_32x32<float, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*)"
0.013709,0.152547,78,0.001955,0.001856,0.002144,"void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const , Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, long>(float, int=1)"
0.013174,0.146596,2,0.073298,0.060257,0.086339,"maxwell_scudnn_128x32_relu_small_nn"
0.012654,0.140804,1,0.140804,0.140804,0.140804,"void DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *, int, void const *, void const *, int, int, int, int, int, float, float, bool, int, DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, int=2, float2, float>, DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, int=2, float2, float>)"
0.007164,0.079714,1,0.079714,0.079714,0.079714,"void zeroPad128<float, float, bool=0>(float const *, float*, int, int, int, int, int, int, int, int)"
0.007060,0.078563,1,0.078563,0.078563,0.078563,"void DSE::vector_fft<int=1, int=2, int=128, int=8, int=8, int=1, int=2, float2, float>(void*, void const *, int, int3, void const *)"
0.006491,0.072227,3,0.024075,0.023905,0.024193,"void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)"
0.004443,0.049441,1,0.049441,0.049441,0.049441,"void fft2d_r2c_32x32<float, unsigned int=5, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool)"
0.003212,0.035746,3,0.011915,0.011584,0.012161,"void fft2d_c2r_16x16<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)"
==2677== System profiling result:
"","Device","Count","Avg","Min","Max"
"SM Clock (MHz)","TITAN Xp (0)",111,1467.333333,1442.000000,1480.000000
"Memory Clock (MHz)","TITAN Xp (0)",111,5511.495495,5508.000000,5702.000000
"Temperature (C)","TITAN Xp (0)",222,42.216216,41.000000,43.000000
"Power (mW)","TITAN Xp (0)",222,67446.067568,62327.000000,114737.000000
"Fan (%)","TITAN Xp (0)",111,23.000000,23.000000,23.000000
==2677== API calls:
"Time(%)","Time","Calls","Avg","Min","Max","Name"
%,s,,ms,ms,ms,
31.404481,1.397244,63649,0.021952,0.006518,517.581843,"cudaLaunch"
20.096560,0.894133,8,111.766653,0.002198,893.458105,"cudaStreamCreateWithFlags"
11.141569,0.495709,7722,0.064194,0.002950,19.206358,"cuStreamSynchronize"
10.629290,0.472917,1,472.916808,472.916808,472.916808,"cuDevicePrimaryCtxRetain"
10.613547,0.472216,3,157.405468,0.001746,472.207307,"cudaFree"
2.723663,0.121181,301089,0.000402,0.000207,1.127813,"cudaSetupArgument"
2.515217,0.111907,7878,0.014204,0.007186,1.137561,"cuMemcpyDtoHAsync"
2.012809,0.089554,7488,0.011959,0.003405,0.193367,"cudaStreamAddCallback"
1.932634,0.085986,116,0.741262,0.004562,26.229276,"cuEventSynchronize"
1.505865,0.066999,7722,0.008676,0.004335,1.962102,"cuMemsetD32Async"
1.025630,0.045632,32275,0.001413,0.000725,0.020333,"cudaStreamWaitEvent"
0.863639,0.038425,63649,0.000603,0.000216,1.082161,"cudaConfigureCall"
0.826703,0.036782,21001,0.001751,0.000713,0.900657,"cudaEventRecord"
0.688389,0.030628,1,30.627703,30.627703,30.627703,"cuMemAlloc"
0.384945,0.017127,2028,0.008445,0.004919,0.022962,"cudaMemsetAsync"
0.249542,0.011103,2568,0.004323,0.001664,0.929958,"cudaBindTexture"
0.168665,0.007504,1891,0.003968,0.001382,0.024929,"cudaEventCreate"
0.161564,0.007188,17125,0.000419,0.000204,0.022504,"cudaGetLastError"
0.151122,0.006724,3,2.241225,1.221855,3.645352,"cuMemHostAlloc"
0.145355,0.006467,2028,0.003188,0.001605,0.018418,"cudaEventQuery"
0.144930,0.006448,547,0.011788,0.005228,0.043820,"cuMemcpyHtoDAsync"
0.099678,0.004435,1287,0.003445,0.001273,0.023363,"cuEventQuery"
0.097328,0.004330,1891,0.002289,0.001162,0.016437,"cudaEventDestroy"
0.084194,0.003746,2568,0.001458,0.000616,0.019912,"cudaUnbindTexture"
0.064927,0.002889,1638,0.001763,0.000361,0.037970,"cuEventRecord"
0.046151,0.002053,6,0.342225,0.005967,1.955736,"cudaMalloc"
0.039570,0.001761,4,0.440140,0.155111,1.147860,"cuDeviceTotalMem"
0.037748,0.001679,275,0.006107,0.000197,0.291168,"cuDeviceGetAttribute"
0.036148,0.001608,390,0.004123,0.002300,0.030644,"cudaStreamQuery"
0.036047,0.001604,78,0.020561,0.003949,0.092277,"cuCtxSynchronize"
0.020474,0.000911,703,0.001295,0.000600,0.003653,"cuStreamWaitEvent"
0.013290,0.000591,1,0.591307,0.591307,0.591307,"cudaGetDeviceProperties"
0.006718,0.000299,1,0.298918,0.298918,0.298918,"cuDeviceGetProperties"
0.006123,0.000272,4,0.068109,0.059952,0.082634,"cuDeviceGetName"
0.005748,0.000256,237,0.001079,0.000436,0.010787,"cuEventCreate"
0.004184,0.000186,232,0.000802,0.000409,0.004222,"cuEventDestroy"
0.003853,0.000171,116,0.001477,0.001260,0.002993,"cuEventElapsedTime"
0.002379,0.000106,2,0.052927,0.027374,0.078481,"cudaMemcpy"
0.002052,0.000091,2,0.045645,0.040672,0.050618,"cuMemGetInfo"
0.001547,0.000069,4,0.017211,0.004092,0.053583,"cuStreamCreate"
0.001353,0.000060,40,0.001504,0.000639,0.011980,"cudaEventCreateWithFlags"
0.001033,0.000046,1,0.045944,0.045944,0.045944,"cuMemsetD32"
0.000678,0.000030,32,0.000943,0.000404,0.005415,"cudaDeviceGetAttribute"
0.000662,0.000029,9,0.003270,0.000859,0.004540,"cuCtxSetCurrent"
0.000366,0.000016,1,0.016293,0.016293,0.016293,"cudaStreamCreateWithPriority"
0.000349,0.000016,3,0.005177,0.004477,0.006498,"cudaGetDevice"
0.000318,0.000014,9,0.001573,0.000255,0.009078,"cuDeviceGetCount"
0.000209,0.000009,2,0.004660,0.003320,0.006000,"cuDeviceGetPCIBusId"
0.000207,0.000009,1,0.009193,0.009193,0.009193,"cudaDeviceGetStreamPriorityRange"
0.000117,0.000005,4,0.001304,0.000520,0.003506,"cuDriverGetVersion"
0.000104,0.000005,1,0.004613,0.004613,0.004613,"cudaGetDeviceCount"
0.000101,0.000005,3,0.001501,0.001147,0.002064,"cuInit"
0.000095,0.000004,1,0.004216,0.004216,0.004216,"cuCtxGetCurrent"
0.000083,0.000004,6,0.000617,0.000298,0.000989,"cuDeviceGet"
0.000026,0.000001,1,0.001139,0.001139,0.001139,"cuDeviceComputeCapability"
0.000018,0.000001,1,0.000817,0.000817,0.000817,"cuDevicePrimaryCtxGetState"