# This config models the Volta Titan V
# For more info about volta architecture:
# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1#
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
# https://devblogs.nvidia.com/inside-volta/
# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70
# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0
# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0
# high level architecture configuration
-gpgpu_n_clusters 40
-gpgpu_n_cores_per_cluster 2
-gpgpu_n_mem 24
-gpgpu_n_sub_partition_per_mchannel 2
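# A quick sanity check on the totals implied above (arithmetic only, not a
# config change): 40 clusters x 2 cores = 80 SMs, matching the TITAN V, and
# 24 memory channels x 2 sub-partitions = 48 memory sub-partitions.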
# volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# The NVIDIA TITANV (Volta) clock domains are adopted from
# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
-gpgpu_clock_domains 1200.0:1200.0:1200.0:850.0
# boost mode
# -gpgpu_clock_domains 1455.0:1455.0:1455.0:850.0
# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_occupancy_sm_number 70
# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
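# Worked example for the pipeline line above: <threads/SM>:<warp size> =
# 2048:32, so each SM holds up to 2048 / 32 = 64 resident warps; with up to
# 32 CTAs per SM, a fully occupied SM averages at least 2 warps per CTA.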
# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## The TITANV has 4 SP SIMD units, 4 INT units, 4 SFU units, 4 DP units, and 4 Tensor Core units per core
## The number of pipeline registers is scaled to match the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
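# Reading the widths against the header comment above: every ID_OC and OC_EX
# stage (SP, DP, INT, SFU, MEM, TENSOR_CORE) is 4 wide, matching the 4 units
# of each type configured below; only EX_WB (the 11th value) is 8 wide.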
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All DIV operations are executed on the SFU unit
# Throughputs (initiation intervals) are adopted from
# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 2,2,2,2,8
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tensor 64
-ptx_opcode_initiation_tensor 64
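# Interpreting latency vs. initiation (assuming the usual meaning: initiation
# is the interval between back-to-back issues to the same unit): an fp32 ADD
# has latency 4 but initiation 2, so each SP unit accepts a new fp ADD warp
# every 2 cycles; dp ops (latency 8, initiation 4) run at half that rate.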
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Default config is 32KB DL1 and 96KB shared memory
# In Volta, the remaining shared memory capacity is assigned to the L1 cache;
# if the assigned shared memory = 0, then the L1 cache = 128KB
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# Disable this mode when running multiple kernels/apps concurrently
-adaptive_volta_cache_config 1
# Volta unified cache has four ports
-mem_unit_ports 4
-gpgpu_cache:dl1 S:4:128:64,L:L:s:N:L,A:256:8,16:0,32
-gpgpu_shmem_size 98304
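# Sanity check on the L1D line above: S:4:128:64 = 4 sets x 64 ways x 128-byte
# lines = 32 KB, and 32 KB + 98304 B (96 KB) of shared memory equals the
# 128 KB unified cache budget described in the comments above.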
-gmem_skip_L1D 0
-icnt_flit_size 40
-gpgpu_n_cluster_ejection_buffer_size 32
-l1_latency 28
-smem_latency 19
-gpgpu_flush_l1_cache 1
# 32 sets of 128-byte lines, 24-way set associative, per memory sub-partition (96 KB each), giving 4.5MB of L2 cache in total
-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:L,A:192:4,32:0,32
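# Sanity check: 32 sets x 24 ways x 128 B = 96 KB per sub-partition, and
# 96 KB x 48 sub-partitions (24 channels x 2) = 4.5 MB of L2 in total.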
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-perf_sim_memcpy 1
-memory_partition_indexing 0
# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
# 48 KB Tex
# Note: TEX is deprecated in Volta and is kept for legacy apps only. Use the L1D cache instead, via the .nc modifier or the __ldg() method
-gpgpu_tex_cache:l1 N:16:128:24,L:R:m:N:L,T:128:4,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
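# Sanity check on the three caches above: IL1 = 64 x 16 x 128 B = 128 KB,
# TEX = 16 x 24 x 128 B = 48 KB, and CONST = 128 x 8 x 64 B = 64 KB,
# matching the size comments.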
# Volta has a sub-core model, in which each scheduler has its own register file and EUs
# i.e., the schedulers are isolated
-sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# Volta has 8 register banks and 4 schedulers, i.e., two banks per scheduler
-gpgpu_num_reg_banks 8
# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 60
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
# interconnection
-network_mode 1
-inter_config_file config_volta_islip.icnt
# memory partition latency config
-rop_latency 120
-dram_latency 100
# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
# For HBM: three stacks, 24 channels, each 128 bits (16 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 16
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
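# Peak-bandwidth sanity check from the settings above: 24 channels x 16 B
# bus x 2 transfers/clock (DDR) x 0.85 GHz = 652.8 GB/s, which matches the
# TITAN V's rated memory bandwidth.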
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
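# Reading the mask (our interpretation, worth double-checking against the
# simulator's address decoder): R/B/C bits select row/bank/column, the 5 low
# S bits cover one 32 B burst (16 B bus x burst length 2), and dramid@8
# inserts the channel index starting at bit 8.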
# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timing for 1 GHz
# tRRDL and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
# Timing for 850 MHz; the TITANV's HBM runs at 850 MHz
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
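# Rough check: at 850 MHz, tCK ~ 1.18 ns, so RCD=12 ~ 14.1 ns and RAS=28 ~
# 32.9 ns, i.e., the same absolute timings as the 1 GHz values (RCD=14,
# RAS=33) commented out above.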
# HBM has a dual-bus interface, which can issue a column command and a row command at the same time
-dual_bus_interface 1
# Select lower address bits for bnkgrp to increase bank-group parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1
#-Seperate_Write_Queue_Enable 1
#-Write_Queue_Size 64:56:32
# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
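# Assuming warps are split evenly across schedulers (as the sub-core model
# implies), each of the 4 schedulers manages 64 / 4 = 16 of the SM's warps.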
# Two Level Scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round robin scheduler
#-gpgpu_scheduler lrr
# Greedy then oldest scheduler
-gpgpu_scheduler gto
# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
# power model configs; disabled until we create a real energy model for Volta
-power_simulation_enabled 0
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0