From fe376dcd53c63856c8d535053728bfb3f35ddcce Mon Sep 17 00:00:00 2001
From: Dean Michael Berris
Date: Thu, 30 Nov 2017 05:35:51 +0000
Subject: [PATCH 001/165] [XRay][docs] Update documentation on new default for
 xray_naive_log=

We've recently changed the default for `xray_naive_log=` to be `false`
instead of `true` to make it more consistent with the FDR mode logging
implementation. This means we will now ask users to explicitly choose which
version of the XRay logging is being used.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319400 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/XRay.rst        | 9 ++++++---
 docs/XRayExample.rst | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/XRay.rst b/docs/XRay.rst
index e9ecc13e3b286..b4443c4d8060a 100644
--- a/docs/XRay.rst
+++ b/docs/XRay.rst
@@ -143,7 +143,7 @@ variable, where we list down the options and their defaults below.
 |                   |                 |               | instrumentation points |
 |                   |                 |               | before main.           |
 +-------------------+-----------------+---------------+------------------------+
-| xray_naive_log    | ``bool``        | ``true``      | Whether to install     |
+| xray_naive_log    | ``bool``        | ``false``     | Whether to install     |
 |                   |                 |               | the naive log          |
 |                   |                 |               | implementation.        |
 +-------------------+-----------------+---------------+------------------------+
@@ -258,8 +258,11 @@ supports the following subcommands:
 - ``account``: Performs basic function call accounting statistics with various
   options for sorting, and output formats (supports CSV, YAML, and
   console-friendly TEXT).
-- ``convert``: Converts an XRay log file from one format to another. Currently
-  only converts to YAML.
+- ``convert``: Converts an XRay log file from one format to another. We can
+  convert from binary XRay traces (both naive and FDR mode) to YAML,
+  `flame-graph `_ friendly text
+  formats, as well as `Chrome Trace Viewer (catapult)
+  ` formats.
 - ``graph``: Generates a DOT graph of the function call relationships between
   functions found in an XRay trace.
 - ``stack``: Reconstructs function call stacks from a timeline of function
diff --git a/docs/XRayExample.rst b/docs/XRayExample.rst
index 56f17507d82f1..953833bc1ef65 100644
--- a/docs/XRayExample.rst
+++ b/docs/XRayExample.rst
@@ -60,7 +60,7 @@ to enable XRay at application start. To do this, XRay checks the
     $ ./bin/llc input.ll
 
     # We need to set the XRAY_OPTIONS to enable some features.
-    $ XRAY_OPTIONS="patch_premain=true" ./bin/llc input.ll
+    $ XRAY_OPTIONS="patch_premain=true xray_naive_log=true" ./bin/llc input.ll
     ==69819==XRay: Log file in 'xray-log.llc.m35qPB'
 
 At this point we now have an XRay trace we can start analysing.

From d60f6e4a5b7560f3b6f7abfe0edb527467f0c997 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 30 Nov 2017 06:31:31 +0000
Subject: [PATCH 002/165] [X86] Make sure we don't remove sign extends of
 masks with AVX2 masked gathers.

We don't use k-registers and instead use the MSB so we need to make sure we
sign extend the mask to the msb.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319405 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 7 ++-- test/CodeGen/X86/avx2-masked-gather.ll | 52 ++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 36c284b0bc180..110af66b42229 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35924,7 +35924,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); // Pre-shrink oversized index elements to avoid triggering scalarization. @@ -35967,7 +35968,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, // the masks is v*i1. So the mask will be truncated anyway. // The SIGN_EXTEND_INREG my be dropped. SDValue Mask = N->getOperand(2); - if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[2] = Mask.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); @@ -37079,7 +37080,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); + case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); diff --git a/test/CodeGen/X86/avx2-masked-gather.ll b/test/CodeGen/X86/avx2-masked-gather.ll index 436fb775775de..bf5ab1657a579 100644 --- a/test/CodeGen/X86/avx2-masked-gather.ll +++ b/test/CodeGen/X86/avx2-masked-gather.ll @@ -9,6 +9,9 @@ declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32: ; X86: # BB#0: # %entry +; X86-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -19,6 +22,9 @@ define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i3 ; ; X64-LABEL: masked_gather_v2i32: ; X64: # BB#0: # %entry +; X64-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-NEXT: vmovdqa (%rdi), %xmm2 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -57,6 +63,9 @@ entry: define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { ; X86-LABEL: masked_gather_v2i32_concat: ; X86: # BB#0: # %entry +; X86-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -68,6 +77,9 @@ define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, ; ; X64-LABEL: masked_gather_v2i32_concat: ; X64: # 
BB#0: # %entry +; X64-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-NEXT: vmovdqa (%rdi), %xmm2 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -112,6 +124,8 @@ define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, < ; X86-LABEL: masked_gather_v2float: ; X86: # BB#0: # %entry ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1 @@ -121,6 +135,8 @@ define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, < ; X64-LABEL: masked_gather_v2float: ; X64: # BB#0: # %entry ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 ; X64-NEXT: vmovaps (%rdi), %xmm2 ; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 @@ -159,6 +175,8 @@ define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %m ; X86-LABEL: masked_gather_v2float_concat: ; X86: # BB#0: # %entry ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1 @@ -168,6 +186,8 @@ define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %m ; X64-LABEL: masked_gather_v2float_concat: ; X64: # BB#0: # %entry ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 ; X64-NEXT: vmovaps (%rdi), %xmm2 ; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 @@ -209,12 +229,16 @@ declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) { ; X86-LABEL: masked_gather_v4i32: ; X86: # BB#0: # %entry +; X86-NEXT: vpslld $31, %xmm1, %xmm1 +; X86-NEXT: vpsrad $31, %xmm1, %xmm1 ; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2 ; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4i32: ; X64: # BB#0: # %entry +; X64-NEXT: vpslld $31, %xmm1, %xmm1 +; X64-NEXT: vpsrad $31, %xmm1, %xmm1 ; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2 ; X64-NEXT: vmovdqa %xmm2, %xmm0 ; X64-NEXT: vzeroupper @@ -267,12 +291,16 @@ declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) { ; X86-LABEL: masked_gather_v4float: ; X86: # BB#0: # %entry +; X86-NEXT: vpslld $31, %xmm1, %xmm1 +; X86-NEXT: vpsrad $31, %xmm1, %xmm1 ; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2 ; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4float: ; X64: # BB#0: # %entry +; X64-NEXT: vpslld $31, %xmm1, %xmm1 +; X64-NEXT: vpsrad $31, %xmm1, %xmm1 ; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2 ; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: vzeroupper @@ -326,6 +354,8 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3 ; X86-LABEL: masked_gather_v8i32: ; X86: # BB#0: # %entry ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: vpslld $31, %ymm0, %ymm0 +; X86-NEXT: vpsrad $31, %ymm0, %ymm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqa (%eax), %ymm2 ; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1 @@ -441,6 +471,8 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, < ; X86-LABEL: masked_gather_v8float: ; X86: # BB#0: # %entry ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: vpslld $31, %ymm0, %ymm0 +; X86-NEXT: vpsrad $31, %ymm0, %ymm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovaps (%eax), %ymm2 ; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1 @@ -710,14 +742,20 @@ declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { ; X86-LABEL: masked_gather_v2i64: ; X86: # BB#0: # %entry +; X86-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpgatherqq %xmm0, (,%xmm3), %xmm1 ; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i64: ; X64: # BB#0: # %entry +; X64-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-NEXT: vmovdqa (%rdi), %xmm2 ; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 ; X64-NEXT: vmovdqa %xmm1, %xmm0 @@ -755,14 +793,20 @@ declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %alig define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { ; X86-LABEL: masked_gather_v2double: ; X86: # BB#0: # %entry +; X86-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; X86-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; X86-NEXT: vgatherqpd %xmm0, (,%xmm3), %xmm1 ; X86-NEXT: vmovapd %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2double: ; X64: # BB#0: # %entry +; X64-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-NEXT: vmovapd (%rdi), %xmm2 ; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 ; X64-NEXT: vmovapd %xmm1, %xmm0 From f6395de02662b7a3efa59d1a7ad6356171e8098a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 30 Nov 2017 07:01:40 +0000 Subject: [PATCH 003/165] [X86] Optimize avx2 vgatherqps for v2f32 with v2i64 index type. Normal type legalization will widen everything. This requires forcing 0s into the mask register. We can instead choose the form that only reads 2 elements without zeroing the mask. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319406 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 19 ++++++++++++------- test/CodeGen/X86/avx2-masked-gather.ll | 14 ++++++-------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 110af66b42229..a21145f07556c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1127,6 +1127,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (HasInt256) { + // Custom legalize 2x32 to get a little better code. + setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); @@ -1360,11 +1363,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, VT, Legal); } - // Custom legalize 2x32 to get a little better code. - if (Subtarget.hasVLX()) { - setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); - } - // Custom lower several nodes. for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) @@ -24863,7 +24861,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if (VT == MVT::v2f32 && Subtarget.hasVLX()) { + if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) @@ -24873,10 +24871,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getValue(), DAG.getUNDEF(MVT::v2f32)); + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. 
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); + } SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(MVT::v4f32, MVT::v2i1, MVT::Other), Ops, dl, + DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); diff --git a/test/CodeGen/X86/avx2-masked-gather.ll b/test/CodeGen/X86/avx2-masked-gather.ll index bf5ab1657a579..2007b7cf76eba 100644 --- a/test/CodeGen/X86/avx2-masked-gather.ll +++ b/test/CodeGen/X86/avx2-masked-gather.ll @@ -134,13 +134,12 @@ define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, < ; ; X64-LABEL: masked_gather_v2float: ; X64: # BB#0: # %entry -; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpsrad $31, %xmm0, %xmm0 -; X64-NEXT: vmovaps (%rdi), %xmm2 -; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 -; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2float: @@ -185,13 +184,12 @@ define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %m ; ; X64-LABEL: masked_gather_v2float_concat: ; X64: # BB#0: # %entry -; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vpsrad $31, %xmm0, %xmm0 -; X64-NEXT: vmovaps (%rdi), %xmm2 -; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 -; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_v2float_concat: From 39be023c86c10a935045a3b98dd978b182815375 Mon Sep 17 00:00:00 2001 From: Hiroshi Inoue Date: Thu, 30 Nov 2017 07:44:46 +0000 Subject: [PATCH 004/165] [SROA] enable splitting for non-whole-alloca loads and stores Currently, SROA splits loads and stores only when they are accessing the whole alloca. This patch relaxes this limitation to allow splitting a load/store if all other loads and stores to the alloca are disjoint to or fully included in the current load/store. If there is no other load or store that crosses the boundary of the current load/store, the current splitting implementation works as is. The whole-alloca loads and stores meet this new condition and so they are still splittable. Here is a simplified motivating example. struct record { long long a; int b; int c; }; int func(struct record r) { for (int i = 0; i < r.c; i++) r.b++; return r.b; } When updating r.b (or r.c as well), LLVM generates redundant instructions on some platforms (such as x86_64, ppc64); here, r.b and r.c are packed into one 64-bit GPR when the struct is passed as a method argument. With this patch, the above example is compiled into only few instructions without loop. Without the patch, unnecessary loop-carried dependency is introduced by SROA and the loop cannot be eliminated by the later optimizers. 
Differential Revision: https://reviews.llvm.org/D32998 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319407 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/SROA.cpp | 31 +++++++++++++++-------- test/DebugInfo/X86/sroasplit-2.ll | 3 ++- test/Transforms/SROA/basictest.ll | 33 +++++++++++++++++++++--- test/Transforms/SROA/big-endian.ll | 40 +++++++++++++++++++++--------- 4 files changed, 80 insertions(+), 27 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index bd064978b645d..d0431d48a4370 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -4047,21 +4048,31 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // First try to pre-split loads and stores. Changed |= presplitLoadsAndStores(AI, AS); - // Now that we have identified any pre-splitting opportunities, mark any - // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail - // to split these during pre-splitting, we want to force them to be - // rewritten into a partition. + // Now that we have identified any pre-splitting opportunities, + // mark loads and stores unsplittable except for the following case. + // We leave a slice splittable if all other slices are disjoint or fully + // included in the slice, such as whole-alloca loads and stores. + // If we fail to split these during pre-splitting, we want to force them + // to be rewritten into a partition. bool IsSorted = true; + + // If a byte boundary is included in any load or store, a slice starting or + // ending at the boundary is not splittable. + unsigned AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType()); + SmallBitVector SplittableOffset(AllocaSize+1, true); + for (Slice &S : AS) + for (unsigned O = S.beginOffset() + 1; O < S.endOffset() && O < AllocaSize; + O++) + SplittableOffset.reset(O); + for (Slice &S : AS) { if (!S.isSplittable()) continue; - // FIXME: We currently leave whole-alloca splittable loads and stores. This - // used to be the only splittable loads and stores and we need to be - // confident that the above handling of splittable loads and stores is - // completely sufficient before we forcibly disable the remaining handling. - if (S.beginOffset() == 0 && - S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType())) + + if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) && + (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()])) continue; + if (isa(S.getUse()->getUser()) || isa(S.getUse()->getUser())) { S.makeUnsplittable(); diff --git a/test/DebugInfo/X86/sroasplit-2.ll b/test/DebugInfo/X86/sroasplit-2.ll index b2bec7cede0e3..3e99ec1e16a59 100644 --- a/test/DebugInfo/X86/sroasplit-2.ll +++ b/test/DebugInfo/X86/sroasplit-2.ll @@ -21,7 +21,8 @@ ; Verify that SROA creates a variable piece when splitting i1. 
; CHECK: call void @llvm.dbg.value(metadata i64 %outer.coerce0, metadata ![[O:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), -; CHECK: call void @llvm.dbg.value(metadata i64 %outer.coerce1, metadata ![[O]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), +; CHECK: call void @llvm.dbg.value(metadata i32 {{.*}}, metadata ![[O]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32)), +; CHECK: call void @llvm.dbg.value(metadata i32 {{.*}}, metadata ![[O]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32)), ; CHECK: call void @llvm.dbg.value({{.*}}, metadata ![[I1:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32)), ; CHECK-DAG: ![[O]] = !DILocalVariable(name: "outer",{{.*}} line: 10 ; CHECK-DAG: ![[I1]] = !DILocalVariable(name: "i1",{{.*}} line: 11 diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index aa00e89ea04f0..9cf21910a5f37 100644 --- a/test/Transforms/SROA/basictest.ll +++ b/test/Transforms/SROA/basictest.ll @@ -1615,13 +1615,13 @@ define i16 @PR24463() { ; Ensure we can handle a very interesting case where there is an integer-based ; rewrite of the uses of the alloca, but where one of the integers in that is ; a sub-integer that requires extraction *and* extends past the end of the -; alloca. In this case, we should extract the i8 and then zext it to i16. +; alloca. SROA can split the alloca to avoid shift or trunc. ; ; CHECK-LABEL: @PR24463( ; CHECK-NOT: alloca -; CHECK: %[[SHIFT:.*]] = lshr i16 0, 8 -; CHECK: %[[TRUNC:.*]] = trunc i16 %[[SHIFT]] to i8 -; CHECK: %[[ZEXT:.*]] = zext i8 %[[TRUNC]] to i16 +; CHECK-NOT: trunc +; CHECK-NOT: lshr +; CHECK: %[[ZEXT:.*]] = zext i8 {{.*}} to i16 ; CHECK: ret i16 %[[ZEXT]] entry: %alloca = alloca [3 x i8] @@ -1695,3 +1695,28 @@ bb1: call void @llvm.lifetime.end.p0i8(i64 2, i8* %0) ret void } + +define void @test28(i64 %v) #0 { +; SROA should split the first i64 store to avoid additional and/or instructions +; when storing into i32 fields + +; CHECK-LABEL: @test28( +; CHECK-NOT: alloca +; CHECK-NOT: and +; CHECK-NOT: or +; CHECK: %[[shift:.*]] = lshr i64 %v, 32 +; CHECK-NEXT: %{{.*}} = trunc i64 %[[shift]] to i32 +; CHECK-NEXT: ret void + +entry: + %t = alloca { i64, i32, i32 } + + %b = getelementptr { i64, i32, i32 }, { i64, i32, i32 }* %t, i32 0, i32 1 + %0 = bitcast i32* %b to i64* + store i64 %v, i64* %0 + + %1 = load i32, i32* %b + %c = getelementptr { i64, i32, i32 }, { i64, i32, i32 }* %t, i32 0, i32 2 + store i32 %1, i32* %c + ret void +} diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll index ea41a20fd38e2..fc4b8b2885561 100644 --- a/test/Transforms/SROA/big-endian.ll +++ b/test/Transforms/SROA/big-endian.ll @@ -83,19 +83,34 @@ entry: store i16 1, i16* %a0i16ptr store i8 1, i8* %a2ptr -; CHECK: %[[mask1:.*]] = and i40 undef, 4294967295 -; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296 %a3i24ptr = bitcast i8* %a3ptr to i24* store i24 1, i24* %a3i24ptr -; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041 -; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256 %a2i40ptr = bitcast i8* %a2ptr to i40* store i40 1, i40* %a2i40ptr -; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56 -; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776 -; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]] + +; the alloca is splitted into multiple slices +; Here, i8 1 is for %a[6] +; CHECK: %[[ext1:.*]] = zext i8 1 to i40 +; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, -256 +; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 
%[[ext1]] + +; Here, i24 0 is for %a[3] to %a[5] +; CHECK-NEXT: %[[ext2:.*]] = zext i24 0 to i40 +; CHECK-NEXT: %[[shift2:.*]] = shl i40 %[[ext2]], 8 +; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041 +; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], %[[shift2]] + +; Here, i8 0 is for %a[2] +; CHECK-NEXT: %[[ext3:.*]] = zext i8 0 to i40 +; CHECK-NEXT: %[[shift3:.*]] = shl i40 %[[ext3]], 32 +; CHECK-NEXT: %[[mask3:.*]] = and i40 %[[insert2]], 4294967295 +; CHECK-NEXT: %[[insert3:.*]] = or i40 %[[mask3]], %[[shift3]] + +; CHECK-NEXT: %[[ext4:.*]] = zext i40 %[[insert3]] to i56 +; CHECK-NEXT: %[[mask4:.*]] = and i56 undef, -1099511627776 +; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[ext4]] ; CHECK-NOT: store ; CHECK-NOT: load @@ -104,11 +119,12 @@ entry: %ai = load i56, i56* %aiptr %ret = zext i56 %ai to i64 ret i64 %ret -; CHECK-NEXT: %[[ext4:.*]] = zext i16 1 to i56 -; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40 -; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775 -; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]] -; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64 +; Here, i16 1 is for %a[0] to %a[1] +; CHECK-NEXT: %[[ext5:.*]] = zext i16 1 to i56 +; CHECK-NEXT: %[[shift5:.*]] = shl i56 %[[ext5]], 40 +; CHECK-NEXT: %[[mask5:.*]] = and i56 %[[insert4]], 1099511627775 +; CHECK-NEXT: %[[insert5:.*]] = or i56 %[[mask5]], %[[shift5]] +; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert5]] to i64 ; CHECK-NEXT: ret i64 %[[ret]] } From 431a76fee482d5d2463d9acfd6ccc70dacb64bcd Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Thu, 30 Nov 2017 08:18:50 +0000 Subject: [PATCH 005/165] [SystemZ] Bugfix in adjustSubwordCmp. Csmith generated a program where a store after load to the same address did not get chained after the new load created during DAG legalizing, and so performed an illegal overwrite of the expected value. When the new zero-extending load is created, the chain users of the original load must be updated, which was not done previously. A similar case was also found and handled in lowerBITCAST. Review: Ulrich Weigand https://reviews.llvm.org/D40542 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319409 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZISelLowering.cpp | 15 +- test/CodeGen/SystemZ/dag-combine-02.ll | 192 +++++++++++++++++++++ 2 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/SystemZ/dag-combine-02.ll diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index d49d7316e682b..ad14e5e34e2ec 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1844,11 +1844,14 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL, ISD::SEXTLOAD : ISD::ZEXTLOAD); if (C.Op0.getValueType() != MVT::i32 || - Load->getExtensionType() != ExtType) + Load->getExtensionType() != ExtType) { C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(), Load->getBasePtr(), Load->getPointerInfo(), Load->getMemoryVT(), Load->getAlignment(), Load->getMemOperand()->getFlags()); + // Update the chain uses. + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1)); + } // Make sure that the second operand is an i32 with the right value. 
   if (C.Op1.getValueType() != MVT::i32 ||
@@ -2940,9 +2943,13 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
   // but we need this case for bitcasts that are created during lowering
   // and which are then lowered themselves.
   if (auto *LoadN = dyn_cast<LoadSDNode>(In))
-    if (ISD::isNormalLoad(LoadN))
-      return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
-                         LoadN->getMemOperand());
+    if (ISD::isNormalLoad(LoadN)) {
+      SDValue NewLoad = DAG.getLoad(ResVT, DL, LoadN->getChain(),
+                                    LoadN->getBasePtr(), LoadN->getMemOperand());
+      // Update the chain uses.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LoadN, 1), NewLoad.getValue(1));
+      return NewLoad;
+    }
 
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
     SDValue In64;
diff --git a/test/CodeGen/SystemZ/dag-combine-02.ll b/test/CodeGen/SystemZ/dag-combine-02.ll
new file mode 100644
index 0000000000000..b20133facb895
--- /dev/null
+++ b/test/CodeGen/SystemZ/dag-combine-02.ll
@@ -0,0 +1,192 @@
+; Test that adjustSubwordCmp() maintains the chains properly when creating a
+; new extending load.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -O3 | FileCheck %s
+
+@g_56 = external hidden unnamed_addr global i64, align 8
+@func_22.l_91 = external hidden unnamed_addr constant [4 x [7 x i16*]], align 8
+@g_102 = external hidden unnamed_addr global i16**, align 8
+@.str = external hidden unnamed_addr constant [2 x i8], align 2
+@.str.1 = external hidden unnamed_addr constant [15 x i8], align 2
+@crc32_context = external hidden unnamed_addr global i32, align 4
+@crc32_tab = external hidden unnamed_addr global [256 x i32], align 4
+@.str.2 = external hidden unnamed_addr constant [36 x i8], align 2
+@.str.3 = external hidden unnamed_addr constant [15 x i8], align 2
+@g_181.0.4.5 = external hidden unnamed_addr global i1, align 2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define signext i32 @main(i32 signext, i8** nocapture readonly) local_unnamed_addr #1 {
+  %3 = alloca [4 x [7 x i16*]], align 8
+  %4 = icmp eq i32 %0, 2
+  br i1 %4, label %5, label %11
+
+;