Skip to content
Permalink
Browse files

Enable support for next generation AMD Zen CPU, via -march=znver2.

gcc/ChangeLog:
	* common/config/i386/i386-common.c (processor_alias_table): Add znver2 entry.
	* config.gcc (i[34567]86-*-linux* | ...): Add znver2.
	(case ${target}): Add znver2.
	* config/i386/driver-i386.c: (host_detect_local_cpu): Let
	-march=native recognize znver2 processors.
	* config/i386/i386-c.c (ix86_target_macros_internal): Add znver2.
	* config/i386/i386.c (m_ZNVER2): New definition.
	(m_ZNVER): New definition.
	(m_AMD_MULTIPLE): Includes m_znver2.
	(processor_cost_table): Add znver2 entry.
	(processor_target_table): Add znver2 entry.
	(get_builtin_code_for_version): Set priority for
	PROCESSOR_ZNVER2.
	(processor_model): Add M_AMDFAM17H_ZNVER2.
	(arch_names_table): Ditto.
	(ix86_reassociation_width): Include znver2. 
	* config/i386/i386.h (TARGET_ZNVER2): New definition.
	(struct ix86_size_cost): Add TARGET_ZNVER2.
	(enum processor_type): Add PROCESSOR_ZNVER2.
	* config/i386/i386.md (define_attr "cpu"): Add znver2.
	* config/i386/x86-tune-costs.h: (processor_costs) Add znver2 costs.
	* config/i386/x86-tune-sched.c: (ix86_issue_rate): Add znver2.
	(ix86_adjust_cost): Add znver2.
	* config/i386/x86-tune.def: Replace m_ZNVER1 by m_ZNVER.
	* gcc/doc/extend.texi: Add details about znver2.
	* gcc/doc/invoke.texi: Add details about znver2.

libgcc/ChangeLog:
	* config/i386/cpuinfo.c (get_amd_cpu): Add znver2.
	* config/i386/cpuinfo.h (processor_subtypes): Ditto.


git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@265775 138bc75d-0d04-0410-961f-82ee72b054a4
  • Loading branch information...
vekumar
vekumar committed Nov 4, 2018
1 parent 0acb32e commit 703d2f69fad4772dc4aedb5327b5e3d88e8e1843
@@ -1,3 +1,33 @@
2018-11-04 Venkataramanan Kumar <venkataramanan.kumar@amd.com>

* common/config/i386/i386-common.c (processor_alias_table): Add
znver2 entry.
* config.gcc (i[34567]86-*-linux* | ...): Add znver2.
(case ${target}): Add znver2.
* config/i386/driver-i386.c: (host_detect_local_cpu): Let
-march=native recognize znver2 processors.
* config/i386/i386-c.c (ix86_target_macros_internal): Add znver2.
* config/i386/i386.c (m_ZNVER2): New definition.
(m_ZNVER): New definition.
(m_AMD_MULTIPLE): Includes m_znver2.
(processor_cost_table): Add znver2 entry.
(processor_target_table): Add znver2 entry.
(get_builtin_code_for_version): Set priority for
PROCESSOR_ZNVER2.
(processor_model): Add M_AMDFAM17H_ZNVER2.
(arch_names_table): Ditto.
(ix86_reassociation_width): Include znver2.
* config/i386/i386.h (TARGET_ZNVER2): New definition.
(struct ix86_size_cost): Add TARGET_ZNVER2.
(enum processor_type): Add PROCESSOR_ZNVER2.
* config/i386/i386.md (define_attr "cpu"): Add znver2.
* config/i386/x86-tune-costs.h: (processor_costs) Add znver2 costs.
* config/i386/x86-tune-sched.c: (ix86_issue_rate): Add znver2.
(ix86_adjust_cost): Add znver2.
* config/i386/x86-tune.def: Replace m_ZNVER1 by m_ZNVER.
* gcc/doc/extend.texi: Add details about znver2.
* gcc/doc/invoke.texi: Add details about znver2.

2018-11-03 Sandra Loosemore <sandra@codesourcery.com>

PR target/87079
@@ -1677,6 +1677,16 @@ const pta processor_alias_table[] =
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
| PTA_SHA | PTA_LZCNT | PTA_POPCNT},
{"znver2", PROCESSOR_ZNVER2, CPU_ZNVER1,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
| PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
| PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
| PTA_SHA | PTA_LZCNT | PTA_POPCNT | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD},
{"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
@@ -664,11 +664,11 @@ pentium4 pentium4m pentiumpro prescott lakemont"
# 64-bit x86 processors supported by --with-arch=. Each processor
# MUST be separated by exactly one space.
x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
bdver3 bdver4 znver1 btver1 btver2 k8 k8-sse3 opteron opteron-sse3 nocona \
core2 corei7 corei7-avx core-avx-i core-avx2 atom slm nehalem westmere \
sandybridge ivybridge haswell broadwell bonnell silvermont knl knm \
skylake-avx512 cannonlake icelake-client icelake-server skylake goldmont \
goldmont-plus tremont x86-64 native"
bdver3 bdver4 znver1 znver2 btver1 btver2 k8 k8-sse3 opteron \
opteron-sse3 nocona core2 corei7 corei7-avx core-avx-i core-avx2 atom \
slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
skylake goldmont goldmont-plus tremont x86-64 native"

# Additional x86 processors supported by --with-cpu=. Each processor
# MUST be separated by exactly one space.
@@ -3337,6 +3337,10 @@ case ${target} in
arch=znver1
cpu=znver1
;;
znver2-*)
arch=znver2
cpu=znver2
;;
bdver4-*)
arch=bdver4
cpu=bdver4
@@ -3454,6 +3458,10 @@ case ${target} in
arch=znver1
cpu=znver1
;;
znver2-*)
arch=znver2
cpu=znver2
;;
bdver4-*)
arch=bdver4
cpu=bdver4
@@ -649,6 +649,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
processor = PROCESSOR_GEODE;
else if (has_movbe && family == 22)
processor = PROCESSOR_BTVER2;
else if (has_clwb)
processor = PROCESSOR_ZNVER2;
else if (has_clzero)
processor = PROCESSOR_ZNVER1;
else if (has_avx2)
@@ -1012,6 +1014,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
case PROCESSOR_ZNVER1:
cpu = "znver1";
break;
case PROCESSOR_ZNVER2:
cpu = "znver2";
break;
case PROCESSOR_BTVER1:
cpu = "btver1";
break;
@@ -124,6 +124,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__znver1");
def_or_undef (parse_in, "__znver1__");
break;
case PROCESSOR_ZNVER2:
def_or_undef (parse_in, "__znver2");
def_or_undef (parse_in, "__znver2__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__btver1");
def_or_undef (parse_in, "__btver1__");
@@ -288,6 +292,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_ZNVER1:
def_or_undef (parse_in, "__tune_znver1__");
break;
case PROCESSOR_ZNVER2:
def_or_undef (parse_in, "__tune_znver2__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__tune_btver1__");
break;
@@ -169,12 +169,14 @@ const struct processor_costs *ix86_cost = NULL;
#define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
#define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
#define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
#define m_ZNVER2 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER2)
#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
#define m_ZNVER (m_ZNVER1 | m_ZNVER2)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
| m_ZNVER1)
| m_ZNVER)

#define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)

@@ -868,6 +870,7 @@ static const struct processor_costs *processor_cost_table[PROCESSOR_max] =
&btver1_cost,
&btver2_cost,
&znver1_cost,
&znver2_cost
};

static unsigned int
arg_str = "znver1";
priority = P_PROC_AVX2;
break;
case PROCESSOR_ZNVER2:
arg_str = "znver2";
priority = P_PROC_AVX2;
break;
}
}

M_INTEL_COREI7_SKYLAKE_AVX512,
M_INTEL_COREI7_CANNONLAKE,
M_INTEL_COREI7_ICELAKE_CLIENT,
M_INTEL_COREI7_ICELAKE_SERVER
M_INTEL_COREI7_ICELAKE_SERVER,
M_AMDFAM17H_ZNVER2
};

static struct _arch_names_table
{"btver2", M_AMD_BTVER2},
{"amdfam17h", M_AMDFAM17H},
{"znver1", M_AMDFAM17H_ZNVER1},
{"znver2", M_AMDFAM17H_ZNVER2},
};

static struct _isa_names_table

/* Integer vector instructions execute in FP unit
and can execute 3 additions and one multiplication per cycle. */
if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
&& op != PLUS && op != MINUS)
if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2)
&& INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;

/* Account for targets that split wide vectors into multiple parts. */
@@ -415,6 +415,7 @@ extern const struct processor_costs ix86_size_cost;
#define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1)
#define TARGET_BTVER2 (ix86_tune == PROCESSOR_BTVER2)
#define TARGET_ZNVER1 (ix86_tune == PROCESSOR_ZNVER1)
#define TARGET_ZNVER2 (ix86_tune == PROCESSOR_ZNVER2)

/* Feature tests against the various tunings. */
enum ix86_tune_indices {
@@ -2272,6 +2273,7 @@ enum processor_type
PROCESSOR_BTVER1,
PROCESSOR_BTVER2,
PROCESSOR_ZNVER1,
PROCESSOR_ZNVER2,
PROCESSOR_max
};

@@ -430,7 +430,7 @@
;; Processor type.
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
atom,slm,glm,haswell,generic,amdfam10,bdver1,bdver2,bdver3,
bdver4,btver2,znver1"
bdver4,btver2,znver1,znver2"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
@@ -1273,6 +1273,133 @@ struct processor_costs znver1_cost = {
"16", /* Func alignment. */
};

/* Algorithm selection table for memcpy when tuning for znver2; it is
referenced from the znver2_cost initializer.
ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
very small blocks it is better to use a loop. For large blocks, a libcall
can use non-temporal accesses and beat inline code considerably.
NOTE(review): the two array entries are presumably the 32-bit and 64-bit
variants (entry 0 uses rep_prefix_4_byte, entry 1 uses rep_prefix_8_byte),
and each {N, alg, flag} triple presumably selects ALG for block sizes up
to N, with -1 meaning "no upper bound" -- confirm against the declaration
of stringop_algs, which is not visible here. */
static stringop_algs znver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
/* Algorithm selection table for memset when tuning for znver2; it is
referenced from the znver2_cost initializer.
NOTE(review): the two array entries are presumably the 32-bit and 64-bit
variants (entry 0 uses rep_prefix_4_byte, entry 1 uses rep_prefix_8_byte),
and each {N, alg, flag} triple presumably selects ALG for block sizes up
to N, with -1 meaning "no upper bound" -- confirm against the declaration
of stringop_algs, which is not visible here. */
static stringop_algs znver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};

/* Cost parameters used when tuning for znver2. The initializer is
positional: the meaning of each value is given by the comment beside it,
and the order of the values must match the field order of
struct processor_costs (declared elsewhere). */
struct processor_costs znver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
/* Depending on parameters, idiv can get faster on Ryzen. This is an upper
bound. */
{COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (22), /* HI. */
COSTS_N_INSNS (30), /* SI. */
COSTS_N_INSNS (45), /* DI. */
COSTS_N_INSNS (45)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */

/* All move costs are relative to integer->integer move times 2 and thus
they are latency*2. */

/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle. Because reg-reg move cost is 2 and the following tables correspond
to doubles of latencies, we do not model this correctly. It does not
seem to make a practical difference to bump prices up even more. */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM
register. */
{6, 6, 6, 10, 20}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
6, 6, /* SSE->integer and integer->SSE
moves. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops.
NOTE(review): VGATHERDPD is named twice above; the second set of figures
presumably refers to VGATHERDPS -- confirm against instruction tables. */
18, 8, /* Gather load static, per_elt. */
18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */

COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13. */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to a width of 6.
SPEC2k6 benchmarks suggest
that 4 works better than 6 probably due to register pressure.
Integer vector operations are taken by FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};

/* skylake_cost should produce code tuned for Skylake family of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
@@ -64,6 +64,7 @@ ix86_issue_rate (void)
case PROCESSOR_BDVER3:
case PROCESSOR_BDVER4:
case PROCESSOR_ZNVER1:
case PROCESSOR_ZNVER2:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
@@ -393,6 +394,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
break;

case PROCESSOR_ZNVER1:
case PROCESSOR_ZNVER2:
/* The stack engine allows push and pop instructions to execute in parallel. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))

0 comments on commit 703d2f6

Please sign in to comment.
You can’t perform that action at this time.