From fe809519b1bb88172f39a8c7730bbf3403c35150 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 04:49:21 +0000 Subject: [PATCH 01/21] Merging r340927: ------------------------------------------------------------------------ r340927 | vstefanovic | 2018-08-29 07:07:14 -0700 (Wed, 29 Aug 2018) | 14 lines [mips] Prevent shrink-wrap for BuildPairF64, ExtractElementF64 when they use $sp For a certain combination of options, BuildPairF64_{64}, ExtractElementF64{_64} may be expanded into instructions using stack. Add implicit operand $sp for such cases so that ShrinkWrapping doesn't move prologue setup below them. Fixes MultiSource/Benchmarks/MallocBench/cfrac for '--target=mips-img-linux-gnu -mcpu=mips32r6 -mfpxx -mnan=2008' and '--target=mips-img-linux-gnu -mcpu=mips32r6 -mfp64 -mnan=2008 -mno-odd-spreg'. Differential Revision: https://reviews.llvm.org/D50986 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346734 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MipsSEFrameLowering.cpp | 17 +- lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 12 ++ ...ldpairf64-extractelementf64-implicit-sp.ll | 32 ++++ ...nk-wrap-buildpairf64-extractelementf64.mir | 150 ++++++++++++++++++ 4 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 test/CodeGen/Mips/buildpairf64-extractelementf64-implicit-sp.ll create mode 100644 test/CodeGen/Mips/shrink-wrap-buildpairf64-extractelementf64.mir diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 687c9f676b340..ef1b3c09bdc4c 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -299,8 +299,12 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, // register). 
Unfortunately, we have to make this decision before register // allocation so for now we use a spill/reload sequence for all // double-precision values in regardless of being an odd/even register. - if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) || - (FP64 && !Subtarget.useOddSPReg())) { + // + // For the cases that should be covered here MipsSEISelDAGToDAG adds $sp as + // implicit operand, so other passes (like ShrinkWrapping) are aware that + // stack is used. + if (I->getNumOperands() == 4 && I->getOperand(3).isReg() + && I->getOperand(3).getReg() == Mips::SP) { unsigned DstReg = I->getOperand(0).getReg(); unsigned LoReg = I->getOperand(1).getReg(); unsigned HiReg = I->getOperand(2).getReg(); @@ -360,9 +364,12 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // register). Unfortunately, we have to make this decision before register // allocation so for now we use a spill/reload sequence for all // double-precision values in regardless of being an odd/even register. - - if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) || - (FP64 && !Subtarget.useOddSPReg())) { + // + // For the cases that should be covered here MipsSEISelDAGToDAG adds $sp as + // implicit operand, so other passes (like ShrinkWrapping) are aware that + // stack is used. 
+ if (I->getNumOperands() == 4 && I->getOperand(3).isReg() + && I->getOperand(3).getReg() == Mips::SP) { unsigned DstReg = I->getOperand(0).getReg(); unsigned SrcReg = Op1.getReg(); unsigned N = Op2.getImm(); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 599c1e913acf0..cf2899dd375e9 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -238,6 +238,18 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { case Mips::WRDSP: addDSPCtrlRegOperands(true, MI, MF); break; + case Mips::BuildPairF64_64: + case Mips::ExtractElementF64_64: + if (!Subtarget->useOddSPReg()) { + MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true)); + break; + } + // fallthrough + case Mips::BuildPairF64: + case Mips::ExtractElementF64: + if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1()) + MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true)); + break; default: replaceUsesWithZeroReg(MRI, MI); } diff --git a/test/CodeGen/Mips/buildpairf64-extractelementf64-implicit-sp.ll b/test/CodeGen/Mips/buildpairf64-extractelementf64-implicit-sp.ll new file mode 100644 index 0000000000000..7847fc89371b8 --- /dev/null +++ b/test/CodeGen/Mips/buildpairf64-extractelementf64-implicit-sp.ll @@ -0,0 +1,32 @@ +; RUN: llc -o - %s -mtriple=mips-unknown-linux-gnu \ +; RUN: -mcpu=mips32 -mattr=+fpxx \ +; RUN: -stop-after=expand-isel-pseudos | \ +; RUN: FileCheck %s -check-prefix=FPXX-IMPLICIT-SP + +; RUN: llc -o - %s -mtriple=mips-unknown-linux-gnu \ +; RUN: -mcpu=mips32r6 -mattr=+fp64,+nooddspreg \ +; RUN: -stop-after=expand-isel-pseudos | \ +; RUN: FileCheck %s -check-prefix=FP64-IMPLICIT-SP + +; RUN: llc -o - %s -mtriple=mips-unknown-linux-gnu \ +; RUN: -mcpu=mips32r2 -mattr=+fpxx \ +; RUN: -stop-after=expand-isel-pseudos | \ +; RUN: FileCheck %s -check-prefix=NO-IMPLICIT-SP + +define double @foo2(i32 signext %v1, double %d1) { +entry: +; FPXX-IMPLICIT-SP: BuildPairF64 
%{{[0-9]+}}, %{{[0-9]+}}, implicit $sp +; FPXX-IMPLICIT-SP: ExtractElementF64 killed %{{[0-9]+}}, 1, implicit $sp +; FP64-IMPLICIT-SP: BuildPairF64_64 %{{[0-9]+}}, %{{[0-9]+}}, implicit $sp +; FP64-IMPLICIT-SP: ExtractElementF64_64 killed %{{[0-9]+}}, 1, implicit $sp +; NO-IMPLICIT-SP: BuildPairF64 %{{[0-9]+}}, %{{[0-9]+}} +; NO-IMPLICIT-SP-NOT: BuildPairF64 %{{[0-9]+}}, %{{[0-9]+}}, implicit $sp +; NO-IMPLICIT-SP: ExtractElementF64 killed %{{[0-9]+}}, 1 +; NO-IMPLICIT-SP-NOT: ExtractElementF64 killed %{{[0-9]+}}, 1, implicit $sp + %conv = fptrunc double %d1 to float + %0 = tail call float @llvm.copysign.f32(float 1.000000e+00, float %conv) + %conv1 = fpext float %0 to double + ret double %conv1 +} + +declare float @llvm.copysign.f32(float, float) diff --git a/test/CodeGen/Mips/shrink-wrap-buildpairf64-extractelementf64.mir b/test/CodeGen/Mips/shrink-wrap-buildpairf64-extractelementf64.mir new file mode 100644 index 0000000000000..cb364de0e9bc0 --- /dev/null +++ b/test/CodeGen/Mips/shrink-wrap-buildpairf64-extractelementf64.mir @@ -0,0 +1,150 @@ +# RUN: llc -o - %s -mtriple=mips-unknown-linux-gnu -enable-shrink-wrap=true \ +# RUN: -start-before=shrink-wrap -stop-after=prologepilog | FileCheck %s + +--- | + declare void @foo() + define void @testBuildPairF64() { + ret void + } + define void @testBuildPairF64_64() { + ret void + } + define void @testBuildPairF64implicitSp() { + ret void + } + define void @testBuildPairF64_64implicitSp() { + ret void + } + define void @testExtractElementF64() { + ret void + } + define void @testExtractElementF64_64() { + ret void + } + define void @testExtractElementF64implicitSp() { + ret void + } + define void @testExtractElementF64_64implicitSp() { + ret void + } +... 
+--- +name: testBuildPairF64 +# CHECK-LABEL: name: testBuildPairF64 +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: BuildPairF64 +body: | + bb.0: + $d0 = BuildPairF64 $zero, $zero + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testBuildPairF64_64 +# CHECK-LABEL: name: testBuildPairF64_64 +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: BuildPairF64_64 +body: | + bb.0: + $d0_64 = BuildPairF64_64 $zero, $zero + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testBuildPairF64implicitSp +# CHECK-LABEL: name: testBuildPairF64implicitSp +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: $sp = ADDiu $sp, -{{[0-9]+}} +body: | + bb.0: + $d0 = BuildPairF64 $zero, $zero, implicit $sp + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testBuildPairF64_64implicitSp +# CHECK-LABEL: name: testBuildPairF64_64implicitSp +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: $sp = ADDiu $sp, -{{[0-9]+}} +body: | + bb.0: + $d0_64 = BuildPairF64_64 $zero, $zero, implicit $sp + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testExtractElementF64 +# CHECK-LABEL: name: testExtractElementF64 +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: ExtractElementF64 +body: | + bb.0: + $at = ExtractElementF64 $d6, 1 + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testExtractElementF64_64 +# CHECK-LABEL: name: testExtractElementF64_64 +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: ExtractElementF64_64 +body: | + bb.0: + $at = ExtractElementF64_64 $d12_64, 1 + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... 
+--- +name: testExtractElementF64implicitSp +# CHECK-LABEL: name: testExtractElementF64implicitSp +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: $sp = ADDiu $sp, -{{[0-9]+}} +body: | + bb.0: + $at = ExtractElementF64 $d6, 1, implicit $sp + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... +--- +name: testExtractElementF64_64implicitSp +# CHECK-LABEL: name: testExtractElementF64_64implicitSp +# CHECK: bb.0 +# CHECK-NEXT: successors +# CHECK-NEXT: {{[[:space:]]$}} +# CHECK-NEXT: $sp = ADDiu $sp, -{{[0-9]+}} +body: | + bb.0: + $at = ExtractElementF64_64 $d12_64, 1, implicit $sp + bb.1: + JAL @foo, implicit-def $ra + bb.2: + RetRA +... From b023def3de61558cb62c21dddae701a7c83d6ea0 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 05:02:15 +0000 Subject: [PATCH 02/21] Merging r340931: ------------------------------------------------------------------------ r340931 | atanasyan | 2018-08-29 07:53:55 -0700 (Wed, 29 Aug 2018) | 6 lines [mips] Involves microMIPS's jump in the analyzable branch set Involves microMIPS's jump in the analyzable branch set to reduce some code patterns. 
Differential revision: https://reviews.llvm.org/D50613 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346735 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MipsSEInstrInfo.cpp | 2 +- test/CodeGen/Mips/longbranch.ll | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 7ffe4aff474d8..9f697cd03cbb1 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -643,7 +643,7 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const { Opc == Mips::BNE64 || Opc == Mips::BGTZ64 || Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 || Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::B || Opc == Mips::J || - Opc == Mips::B_MM || Opc == Mips::BEQZC_MM || + Opc == Mips::J_MM || Opc == Mips::B_MM || Opc == Mips::BEQZC_MM || Opc == Mips::BNEZC_MM || Opc == Mips::BEQC || Opc == Mips::BNEC || Opc == Mips::BLTC || Opc == Mips::BGEC || Opc == Mips::BLTUC || Opc == Mips::BGEUC || Opc == Mips::BGTZC || Opc == Mips::BLEZC || diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll index 74cd1fe3aa4f7..43eaa50daf050 100644 --- a/test/CodeGen/Mips/longbranch.ll +++ b/test/CodeGen/Mips/longbranch.ll @@ -231,16 +231,13 @@ define void @test1(i32 signext %s) { ; MICROMIPSSTATIC: # %bb.0: # %entry ; MICROMIPSSTATIC-NEXT: bnezc $4, $BB0_2 ; MICROMIPSSTATIC-NEXT: # %bb.1: # %entry -; MICROMIPSSTATIC-NEXT: j $BB0_4 -; MICROMIPSSTATIC-NEXT: nop -; MICROMIPSSTATIC-NEXT: $BB0_2: # %entry ; MICROMIPSSTATIC-NEXT: j $BB0_3 ; MICROMIPSSTATIC-NEXT: nop -; MICROMIPSSTATIC-NEXT: $BB0_3: # %then +; MICROMIPSSTATIC-NEXT: $BB0_2: # %then ; MICROMIPSSTATIC-NEXT: lui $1, %hi(x) ; MICROMIPSSTATIC-NEXT: li16 $2, 1 ; MICROMIPSSTATIC-NEXT: sw $2, %lo(x)($1) -; MICROMIPSSTATIC-NEXT: $BB0_4: # %end +; MICROMIPSSTATIC-NEXT: 
$BB0_3: # %end ; MICROMIPSSTATIC-NEXT: jrc $ra ; ; MICROMIPSR6STATIC-LABEL: test1: From aaf6ddfa1c79c9610bc35f208514a516d2cbbd5b Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 05:19:01 +0000 Subject: [PATCH 03/21] Merging r340932: ------------------------------------------------------------------------ r340932 | atanasyan | 2018-08-29 07:54:01 -0700 (Wed, 29 Aug 2018) | 11 lines [mips] Fix microMIPS unconditional branch offset handling MipsSEInstrInfo class defines for internal purpose unconditional branches as Mips::B nad Mips:J even in case of microMIPS code generation. Under some conditions that leads to the bug - for rather long branch which fits to Mips jump instruction offset size, but does not fit to microMIPS jump offset size, we generate 'short' branch and later show an error 'out of range PC16 fixup' after check in the isBranchOffsetInRange routine. Differential revision: https://reviews.llvm.org/D50615 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346736 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MipsSEInstrInfo.cpp | 9 ++- test/CodeGen/Mips/micromips-b-range.ll | 98 ++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/Mips/micromips-b-range.ll diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 9f697cd03cbb1..e8589fc53492b 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -25,9 +25,14 @@ using namespace llvm; +static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { + if (STI.inMicroMipsMode()) + return STI.isPositionIndependent() ? Mips::B_MM : Mips::J_MM; + return STI.isPositionIndependent() ? Mips::B : Mips::J; +} + MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, STI.isPositionIndependent() ? 
Mips::B : Mips::J), - RI() {} + : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI() {} const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { return RI; diff --git a/test/CodeGen/Mips/micromips-b-range.ll b/test/CodeGen/Mips/micromips-b-range.ll new file mode 100644 index 0000000000000..f761d1c31d320 --- /dev/null +++ b/test/CodeGen/Mips/micromips-b-range.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=mips -relocation-model=pic -mattr=+micromips \ +; RUN: -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s + +; CHECK-LABEL: foo: +; CHECK-NEXT: 0: 41 a2 00 00 lui $2, 0 +; CHECK-NEXT: 4: 30 42 00 00 addiu $2, $2, 0 +; CHECK-NEXT: 8: 03 22 11 50 addu $2, $2, $25 +; CHECK-NEXT: c: fc 42 00 00 lw $2, 0($2) +; CHECK-NEXT: 10: 69 20 lw16 $2, 0($2) +; CHECK-NEXT: 12: 40 c2 00 14 bgtz $2, 44 +; CHECK-NEXT: 16: 00 00 00 00 nop +; CHECK-NEXT: 1a: 33 bd ff f8 addiu $sp, $sp, -8 +; CHECK-NEXT: 1e: fb fd 00 00 sw $ra, 0($sp) +; CHECK-NEXT: 22: 41 a1 00 01 lui $1, 1 +; CHECK-NEXT: 26: 40 60 00 02 bal 8 +; CHECK-NEXT: 2a: 30 21 04 68 addiu $1, $1, 1128 +; CHECK-NEXT: 2e: 00 3f 09 50 addu $1, $ra, $1 +; CHECK-NEXT: 32: ff fd 00 00 lw $ra, 0($sp) +; CHECK-NEXT: 36: 00 01 0f 3c jr $1 +; CHECK-NEXT: 3a: 33 bd 00 08 addiu $sp, $sp, 8 +; CHECK-NEXT: 3e: 94 00 00 02 b 8 +; CHECK-NEXT: 42: 00 00 00 00 nop +; CHECK-NEXT: 46: 30 20 4e 1f addiu $1, $zero, 19999 +; CHECK-NEXT: 4a: b4 22 00 14 bne $2, $1, 44 +; CHECK-NEXT: 4e: 00 00 00 00 nop +; CHECK-NEXT: 52: 33 bd ff f8 addiu $sp, $sp, -8 +; CHECK-NEXT: 56: fb fd 00 00 sw $ra, 0($sp) +; CHECK-NEXT: 5a: 41 a1 00 01 lui $1, 1 +; CHECK-NEXT: 5e: 40 60 00 02 bal 8 +; CHECK-NEXT: 62: 30 21 04 5c addiu $1, $1, 1116 +; CHECK-NEXT: 66: 00 3f 09 50 addu $1, $ra, $1 +; CHECK-NEXT: 6a: ff fd 00 00 lw $ra, 0($sp) +; CHECK-NEXT: 6e: 00 01 0f 3c jr $1 +; CHECK-NEXT: 72: 33 bd 00 08 addiu $sp, $sp, 8 +; CHECK-NEXT: 76: 30 20 27 0f addiu $1, $zero, 9999 +; CHECK-NEXT: 7a: 94 22 00 14 beq $2, $1, 44 +; CHECK-NEXT: 7e: 00 00 00 00 nop +; 
CHECK-NEXT: 82: 33 bd ff f8 addiu $sp, $sp, -8 +; CHECK-NEXT: 86: fb fd 00 00 sw $ra, 0($sp) +; CHECK-NEXT: 8a: 41 a1 00 01 lui $1, 1 +; CHECK-NEXT: 8e: 40 60 00 02 bal 8 +; CHECK-NEXT: 92: 30 21 04 2c addiu $1, $1, 1068 +; CHECK-NEXT: 96: 00 3f 09 50 addu $1, $ra, $1 +; CHECK-NEXT: 9a: ff fd 00 00 lw $ra, 0($sp) +; CHECK-NEXT: 9e: 00 01 0f 3c jr $1 +; CHECK-NEXT: a2: 33 bd 00 08 addiu $sp, $sp, 8 + +; CHECK: 10466: 00 00 00 00 nop +; CHECK-NEXT: 1046a: 94 00 00 02 b 8 +; CHECK-NEXT: 1046e: 00 00 00 00 nop +; CHECK-NEXT: 10472: 33 bd ff f8 addiu $sp, $sp, -8 +; CHECK-NEXT: 10476: fb fd 00 00 sw $ra, 0($sp) +; CHECK-NEXT: 1047a: 41 a1 00 01 lui $1, 1 +; CHECK-NEXT: 1047e: 40 60 00 02 bal 8 +; CHECK-NEXT: 10482: 30 21 04 00 addiu $1, $1, 1024 +; CHECK-NEXT: 10486: 00 3f 09 50 addu $1, $ra, $1 +; CHECK-NEXT: 1048a: ff fd 00 00 lw $ra, 0($sp) +; CHECK-NEXT: 1048e: 00 01 0f 3c jr $1 +; CHECK-NEXT: 10492: 33 bd 00 08 addiu $sp, $sp, 8 +; CHECK-NEXT: 10496: 94 00 00 02 b 8 + +@x = external global i32, align 4 + +define void @foo() { + %1 = load i32, i32* @x, align 4 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %la, label %lf + +la: + switch i32 %1, label %le [ + i32 9999, label %lb + i32 19999, label %lc + ] + +lb: + tail call void asm sideeffect ".space 0", ""() + br label %le + +lc: + tail call void asm sideeffect ".space 0", ""() + br label %le + +le: + tail call void asm sideeffect ".space 66500", ""() + br label %lg + +lf: + tail call void asm sideeffect ".space 0", ""() + br label %lg + +lg: + tail call void asm sideeffect ".space 0", ""() + br label %li + +li: + tail call void asm sideeffect ".space 0", ""() + ret void +} From b7cd010ff5e925f168e6eb7a2687e831073ea5b0 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 05:28:23 +0000 Subject: [PATCH 04/21] Merging r341221: ------------------------------------------------------------------------ r341221 | atanasyan | 2018-08-31 08:57:17 -0700 (Fri, 31 Aug 2018) | 12 lines [mips] Fix `mtc1` and `mfc1` 
definitions for microMIPS R6 The `mtc1` and `mfc1` definitions in the MipsInstrFPU.td have MMRel, but do not have StdMMR6Rel tags. When these instructions are emitted for microMIPS R6 targets, `Mips::MipsR62MicroMipsR6` nor `Mips::Std2MicroMipsR6` cannot find correct op-codes and as a result the backend uses mips32 variant of the instructions encoding. The patch fixes this problem by adding the StdMMR6Rel tag and check instructions encoding in the test case. Differential revision: https://reviews.llvm.org/D51482 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346737 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MicroMips32r6InstrInfo.td | 2 +- lib/Target/Mips/MipsInstrFPU.td | 8 +-- test/CodeGen/Mips/micromips-mtc-mfc.ll | 68 +++++++++++++++++++++++ 3 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 test/CodeGen/Mips/micromips-mtc-mfc.ll diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index f795112ae2b79..6b0aa7756eab1 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -1733,7 +1733,7 @@ defm S_MMR6 : Cmp_Pats, ISA_MICROMIPS32R6; defm D_MMR6 : Cmp_Pats, ISA_MICROMIPS32R6; def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6; -def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1 ZERO))>, ISA_MICROMIPS32R6; +def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1_MMR6 ZERO))>, ISA_MICROMIPS32R6; def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), (TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6; diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index dd30e20a743cd..e986942ad8fa1 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -485,14 +485,14 @@ let AdditionalPredicates = [NotInMicroMips] in { def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>, 
ISA_MIPS1; - def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, - bitconvert>, MFC1_FM<0>, ISA_MIPS1; + def MFC1 : MMRel, StdMMR6Rel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, + bitconvert>, MFC1_FM<0>, ISA_MIPS1; def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, ISA_MIPS1, FGR_64 { let DecoderNamespace = "MipsFP64"; } - def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, - bitconvert>, MFC1_FM<4>, ISA_MIPS1; + def MTC1 : MMRel, StdMMR6Rel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, + bitconvert>, MFC1_FM<4>, ISA_MIPS1; def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, ISA_MIPS1, FGR_64 { let DecoderNamespace = "MipsFP64"; diff --git a/test/CodeGen/Mips/micromips-mtc-mfc.ll b/test/CodeGen/Mips/micromips-mtc-mfc.ll new file mode 100644 index 0000000000000..084c57ab5d26e --- /dev/null +++ b/test/CodeGen/Mips/micromips-mtc-mfc.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mips -mcpu=mips32r2 -mattr=+micromips \ +; RUN: -show-mc-encoding < %s | FileCheck --check-prefix=MM2 %s +; RUN: llc -mtriple=mips -mcpu=mips32r6 -mattr=+micromips \ +; RUN: -show-mc-encoding < %s | FileCheck --check-prefix=MM6 %s + +define double @foo(double %a, double %b) { +; MM2-LABEL: foo: +; MM2: # %bb.0: # %entry +; MM2-NEXT: mov.d $f0, $f12 # encoding: [0x54,0x0c,0x20,0x7b] +; MM2-NEXT: mtc1 $zero, $f2 # encoding: [0x54,0x02,0x28,0x3b] +; MM2-NEXT: mthc1 $zero, $f2 # encoding: [0x54,0x02,0x38,0x3b] +; MM2-NEXT: c.ule.d $f12, $f2 # encoding: [0x54,0x4c,0x05,0xfc] +; MM2-NEXT: bc1t $BB0_2 # encoding: [0x43,0xa0,A,A] +; MM2-NEXT: # fixup A - offset: 0, value: ($BB0_2), kind: fixup_MICROMIPS_PC16_S1 +; MM2-NEXT: nop # encoding: [0x00,0x00,0x00,0x00] +; MM2-NEXT: # %bb.1: # %entry +; MM2-NEXT: j $BB0_2 # encoding: [0b110101AA,A,A,A] +; MM2-NEXT: # fixup A - offset: 0, value: ($BB0_2), kind: fixup_MICROMIPS_26_S1 +; MM2-NEXT: nop # encoding: 
[0x00,0x00,0x00,0x00] +; MM2-NEXT: $BB0_2: # %return +; MM2-NEXT: jrc $ra # encoding: [0x45,0xbf] +; +; MM6-LABEL: foo: +; MM6: # %bb.0: # %entry +; MM6-NEXT: mov.d $f0, $f12 # encoding: [0x46,0x20,0x60,0x06] +; MM6-NEXT: mtc1 $zero, $f1 # encoding: [0x54,0x01,0x28,0x3b] +; MM6-NEXT: mthc1 $zero, $f1 # encoding: [0x54,0x01,0x38,0x3b] +; MM6-NEXT: cmp.ule.d $f1, $f12, $f1 # encoding: [0x54,0x2c,0x09,0xd5] +; MM6-NEXT: mfc1 $2, $f1 # encoding: [0x54,0x41,0x20,0x3b] +; MM6-NEXT: andi16 $2, $2, 1 # encoding: [0x2d,0x21] +; MM6-NEXT: jrc $ra # encoding: [0x45,0xbf] +entry: + %cmp = fcmp ogt double %a, 0.000000e+00 + br i1 %cmp, label %if.end, label %if.else + +if.else: + br label %return + +if.end: + %mul = fmul double %a, 2.000000e+00 + br label %return + +return: + ret double %a +} + +define double @bar(double %x, double %y) { +; MM2-LABEL: bar: +; MM2: # %bb.0: # %entry +; MM2-NEXT: mov.d $f0, $f14 # encoding: [0x54,0x0e,0x20,0x7b] +; MM2-NEXT: c.olt.d $f12, $f14 # encoding: [0x55,0xcc,0x05,0x3c] +; MM2-NEXT: jr $ra # encoding: [0x00,0x1f,0x0f,0x3c] +; MM2-NEXT: movt.d $f0, $f12, $fcc0 # encoding: [0x54,0x0c,0x02,0x60] +; +; MM6-LABEL: bar: +; MM6: # %bb.0: # %entry +; MM6-NEXT: cmp.lt.d $f0, $f12, $f14 # encoding: [0x55,0xcc,0x01,0x15] +; MM6-NEXT: mfc1 $1, $f0 # encoding: [0x54,0x20,0x20,0x3b] +; MM6-NEXT: mtc1 $1, $f0 # encoding: [0x44,0x81,0x00,0x00] +; MM6-NEXT: sel.d $f0, $f14, $f12 # encoding: [0x55,0x8e,0x02,0xb8] +; MM6-NEXT: jrc $ra # encoding: [0x45,0xbf] +; FIXME: mtc1 is encoded as a regular non-microMIPS instruction +entry: + %z = fcmp olt double %x, %y + %r = select i1 %z, double %x, double %y + ret double %r +} From 6ef29028d44462250fa7e99ed0627e73757d1777 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 05:43:07 +0000 Subject: [PATCH 05/21] Merging r341919: ------------------------------------------------------------------------ r341919 | atanasyan | 2018-09-11 02:57:25 -0700 (Tue, 11 Sep 2018) | 18 lines [mips] Add a pattern for 
64-bit GPR variant of the `rdhwr` instruction MIPS ISAs start to support third operand for the `rdhwr` instruction starting from Revision 6. But LLVM generates assembler code with three-operands version of this instruction on any MIPS64 ISA. The third operand is always zero, so in case of direct code generation we get correct code. This patch fixes the bug by adding an instruction alias. The same alias already exists for 32-bit ISA. Ideally, we also need to reject three-operands version of the `rdhwr` instruction in an assembler code if ISA revision is less than 6. That is a task for a separate patch. This fixes PR38861 (https://bugs.llvm.org/show_bug.cgi?id=38861) Differential revision: https://reviews.llvm.org/D51773 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346739 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/Mips64InstrInfo.td | 3 +++ test/CodeGen/Mips/tls.ll | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 878ec29b188d6..fb6b4affacb35 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -1139,3 +1139,6 @@ def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs), "sltu\t$rs, $rt, $imm">, GPR_64; def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs, imm64:$imm)>, GPR_64; + +def : MipsInstAlias<"rdhwr $rt, $rs", + (RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64; diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll index 3ef9d63196c3d..126cfea97287b 100644 --- a/test/CodeGen/Mips/tls.ll +++ b/test/CodeGen/Mips/tls.ll @@ -48,14 +48,14 @@ entry: ; STATIC32-LABEL: f1: ; STATIC32: lui $[[R0:[0-9]+]], %tprel_hi(t1) ; STATIC32: addiu $[[R1:[0-9]+]], $[[R0]], %tprel_lo(t1) -; STATIC32: rdhwr $3, $29 +; STATIC32: rdhwr $3, $29{{$}} ; STATIC32: addu $[[R2:[0-9]+]], $3, $[[R1]] ; 
STATIC32: lw $2, 0($[[R2]]) ; STATIC64-LABEL: f1: ; STATIC64: lui $[[R0:[0-9]+]], %tprel_hi(t1) ; STATIC64: daddiu $[[R1:[0-9]+]], $[[R0]], %tprel_lo(t1) -; STATIC64: rdhwr $3, $29, 0 +; STATIC64: rdhwr $3, $29{{$}} ; STATIC64: daddu $[[R2:[0-9]+]], $3, $[[R0]] ; STATIC64: lw $2, 0($[[R2]]) } @@ -101,7 +101,7 @@ entry: ; STATIC32-LABEL: f2: ; STATIC32: lui $[[R0:[0-9]+]], %hi(__gnu_local_gp) ; STATIC32: addiu $[[GP:[0-9]+]], $[[R0]], %lo(__gnu_local_gp) -; STATIC32: rdhwr $3, $29 +; STATIC32: rdhwr $3, $29{{$}} ; STATIC32: lw $[[R0:[0-9]+]], %gottprel(t2)($[[GP]]) ; STATIC32: addu $[[R1:[0-9]+]], $3, $[[R0]] ; STATIC32: lw $2, 0($[[R1]]) @@ -109,7 +109,7 @@ entry: ; STATIC64-LABEL: f2: ; STATIC64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(f2))) ; STATIC64: daddiu $[[GP:[0-9]+]], $[[R0]], %lo(%neg(%gp_rel(f2))) -; STATIC64: rdhwr $3, $29 +; STATIC64: rdhwr $3, $29{{$}} ; STATIC64: ld $[[R0:[0-9]+]], %gottprel(t2)($[[GP]]) ; STATIC64: daddu $[[R1:[0-9]+]], $3, $[[R0]] ; STATIC64: lw $2, 0($[[R1]]) From 2673179dd319fc771fae1edd982bf01ea91df5b2 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 05:58:13 +0000 Subject: [PATCH 06/21] Merging r342884: ------------------------------------------------------------------------ r342884 | petarj | 2018-09-24 07:14:19 -0700 (Mon, 24 Sep 2018) | 12 lines [Mips][FastISel] Fix selectBranch on icmp i1 The r337288 tried to fix result of icmp i1 when its input is not sanitized by falling back to DagISel. While it now produces the correct result for bit 0, the other bits can still hold arbitrary value which is not supported by MipsFastISel branch lowering. This patch fixes the issue by falling back to DagISel in this case. Patch by Dragan Mladjenovic. 
Differential Revision: https://reviews.llvm.org/D52045 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346741 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MipsFastISel.cpp | 5 + test/CodeGen/Mips/Fast-ISel/bricmpi1.ll | 189 ++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 test/CodeGen/Mips/Fast-ISel/bricmpi1.ll diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 19b30a44e86ad..22ade31a72cdb 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -953,6 +953,11 @@ bool MipsFastISel::selectBranch(const Instruction *I) { MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; // For now, just try the simplest case where it's fed by a compare. if (const CmpInst *CI = dyn_cast(BI->getCondition())) { + MVT CIMVT = + TLI.getValueType(DL, CI->getOperand(0)->getType(), true).getSimpleVT(); + if (CIMVT == MVT::i1) + return false; + unsigned CondReg = getRegForValue(CI); BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ)) .addReg(CondReg) diff --git a/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll new file mode 100644 index 0000000000000..47b3c92203d1b --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll @@ -0,0 +1,189 @@ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel=true -mcpu=mips32r2 \ +; RUN: < %s -verify-machineinstrs | FileCheck %s + +define void @testeq(i32, i32) { +; CHECK-LABEL: testeq: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: beq $[[REG0]], $[[REG1]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp eq i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testne(i32, i32) { +; CHECK-LABEL: testne: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi 
$[[REG1:[0-9]+]], $5, 1 +; CHECK: bne $[[REG0]], $[[REG1]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp ne i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testugt(i32, i32) { +; CHECK-LABEL: testugt: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: sltu $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]] +; CHECK: bnez $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp ugt i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testuge(i32, i32) { +; CHECK-LABEL: testuge: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: sltu $[[REG2:[0-9]+]], $[[REG0]], $[[REG1]] +; CHECK: beqz $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp uge i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testult(i32, i32) { +; CHECK-LABEL: testult: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: sltu $[[REG2:[0-9]+]], $[[REG0]], $[[REG1]] +; CHECK: bnez $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp ult i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testule(i32, i32) { +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: sltu $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]] +; CHECK: beqz $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp ule i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testsgt(i32, i32) { +; CHECK-LABEL: testsgt: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: negu $[[REG0]], $[[REG0]] +; CHECK: andi $[[REG1:[0-9]+]], 
$5, 1 +; CHECK: negu $[[REG1]], $[[REG1]] +; CHECK: slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]] +; CHECK: bnez $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp sgt i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testsge(i32, i32) { +; CHECK-LABEL: testsge: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: negu $[[REG0]], $[[REG0]] +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: negu $[[REG1]], $[[REG1]] +; CHECK: slt $[[REG2:[0-9]+]], $[[REG0]], $[[REG1]] +; CHECK: beqz $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp sge i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testslt(i32, i32) { +; CHECK-LABEL: testslt: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: negu $[[REG0]], $[[REG0]] +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: negu $[[REG1]], $[[REG1]] +; CHECK: slt $[[REG2:[0-9]+]], $[[REG0]], $[[REG1]] +; CHECK: bnez $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp slt i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +define void @testsle(i32, i32) { +; CHECK-LABEL: testsle: +; CHECK: andi $[[REG0:[0-9]+]], $4, 1 +; CHECK: negu $[[REG0]], $[[REG0]] +; CHECK: andi $[[REG1:[0-9]+]], $5, 1 +; CHECK: negu $[[REG1]], $[[REG1]] +; CHECK: slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]] +; CHECK: beqz $[[REG2]], + %3 = trunc i32 %0 to i1 + %4 = trunc i32 %1 to i1 + %5 = icmp sle i1 %3, %4 + br i1 %5, label %end, label %trap +trap: + call void @llvm.trap() + br label %end +end: + ret void +} + + +declare void @llvm.trap() From 7d87789bb6f9c5246c8dd8daf93fcaef6471abaf Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 13 Nov 2018 06:06:53 +0000 Subject: [PATCH 07/21] Merging r342946: ------------------------------------------------------------------------ 
r342946 | smaksimovic | 2018-09-24 23:27:49 -0700 (Mon, 24 Sep 2018) | 6 lines [mips] Correct MUL pattern for mips64 Guard existing pattern with a predicate, introduce a new one for revision 6. Differential Revision: https://reviews.llvm.org/D51684 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@346742 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/Mips64InstrInfo.td | 2 +- lib/Target/Mips/Mips64r6InstrInfo.td | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index fb6b4affacb35..b5317bec70c47 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -838,7 +838,7 @@ def : MipsPat<(i64 (sext (i32 (sub GPR32:$src, GPR32:$src2)))), (SUBu GPR32:$src, GPR32:$src2), sub_32)>; def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (MUL GPR32:$src, GPR32:$src2), sub_32)>; + (MUL GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS3_NOT_32R6_64R6; def : MipsPat<(i64 (sext (i32 (MipsMFHI ACC64:$src)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (PseudoMFHI ACC64:$src), sub_32)>; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index 9df802cc30b93..ac223bc772561 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -301,6 +301,9 @@ def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f), // Patterns used for matching away redundant sign extensions. // MIPS32 arithmetic instructions sign extend their result implicitly. 
+def : MipsPat<(i64 (sext (i32 (mul GPR32:$src, GPR32:$src2)))), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MUL_R6 GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6; def : MipsPat<(i64 (sext (i32 (sdiv GPR32:$src, GPR32:$src2)))), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (DIV GPR32:$src, GPR32:$src2), sub_32)>, ISA_MIPS64R6; From 7515784defa9816244a243edb6cb070386525fa5 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 16 Nov 2018 04:22:24 +0000 Subject: [PATCH 08/21] Merging r344516: ------------------------------------------------------------------------ r344516 | abeserminji | 2018-10-15 07:39:12 -0700 (Mon, 15 Oct 2018) | 12 lines [mips][micromips] Fix overlaping FDEs error When compiling static executable for micromips, CFI symbols are incorrectly labeled as MICROMIPS, which cause ".eh_frame_hdr refers to overlapping FDEs." error. This patch does not label CFI symbols as MICROMIPS, and FDEs do not overlap anymore. This patch also exposes another bug, which is fixed here: https://reviews.llvm.org/D52985 Differential Revision: https://reviews.llvm.org/D52987 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347023 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Mips/MCTargetDesc/MipsELFStreamer.cpp | 17 +++++++++ .../Mips/MCTargetDesc/MipsELFStreamer.h | 7 ++++ test/DebugInfo/Mips/eh_frame.ll | 38 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 test/DebugInfo/Mips/eh_frame.ll diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 7b9a02503ce21..21b01e8509678 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" 
#include "llvm/MC/MCSymbolELF.h" @@ -53,6 +54,22 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst, createPendingLabelRelocs(); } +void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { + Frame.Begin = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.Begin); +} + +MCSymbol *MipsELFStreamer::EmitCFILabel() { + MCSymbol *Label = getContext().createTempSymbol("cfi", true); + MCELFStreamer::EmitLabel(Label); + return Label; +} + +void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { + Frame.End = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.End); +} + void MipsELFStreamer::createPendingLabelRelocs() { MipsTargetELFStreamer *ELFTargetStreamer = static_cast(getTargetStreamer()); diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index d141f5d77c61b..56a0ff96c7bd6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -26,6 +26,7 @@ class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCSubtargetInfo; +struct MCDwarfFrameInfo; class MipsELFStreamer : public MCELFStreamer { SmallVector, 8> MipsOptionRecords; @@ -60,6 +61,12 @@ class MipsELFStreamer : public MCELFStreamer { void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; void EmitIntValue(uint64_t Value, unsigned Size) override; + // Overriding these functions allows us to avoid recording of these labels + // in EmitLabel and later marking them as microMIPS. + void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; + void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; + MCSymbol *EmitCFILabel() override; + /// Emits all the option records stored up until the point it's called. 
void EmitMipsOptionRecords(); diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll new file mode 100644 index 0000000000000..4687443cb1cff --- /dev/null +++ b/test/DebugInfo/Mips/eh_frame.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s + +; CHECK: .rel.eh_frame +; CHECK: DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .text +; CHECK-NEXT: .gcc_except_table + +@_ZTIi = external constant i8* + +define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception.i to i32* + store i32 5, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %.noexc unwind label %return + +.noexc: + unreachable + +return: + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind + tail call void @__cxa_end_catch() + ret i32 0 +} + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr + +declare void @__cxa_end_catch() local_unnamed_addr + +declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr From 05998067a2e562cbcbd8970b32bf9ce4f3eb8064 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 16 Nov 2018 05:16:44 +0000 Subject: [PATCH 09/21] Merging r344591: ------------------------------------------------------------------------ r344591 | abeserminji | 2018-10-16 01:27:28 -0700 (Tue, 16 Oct 2018) | 11 lines [mips][micromips] Fix how values in .gcc_except_table are calculated When a landing pad is calculated in a program that is compiled for micromips, it will point to an even address. 
Such an error will cause a segmentation fault, as the instructions in micromips are aligned on odd addresses. This patch sets the last bit of the offset where a landing pad is, to 1, which will effectively be an odd address and point to the instruction exactly. Differential Revision: https://reviews.llvm.org/D52985 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347028 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCAsmBackend.h | 5 +++ lib/MC/MCExpr.cpp | 5 +++ .../Mips/MCTargetDesc/MipsAsmBackend.cpp | 9 +++++ lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 2 + .../Mips/micromips-gcc-except-table.ll | 37 +++++++++++++++++++ 5 files changed, 58 insertions(+) create mode 100644 test/CodeGen/Mips/micromips-gcc-except-table.ll diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h index 030d3c05aa5ad..07835c21fcedb 100644 --- a/include/llvm/MC/MCAsmBackend.h +++ b/include/llvm/MC/MCAsmBackend.h @@ -165,6 +165,11 @@ class MCAsmBackend { return 0; } + /// Check whether a given symbol has been flagged with MICROMIPS flag. + virtual bool isMicroMips(const MCSymbol *Sym) const { + return false; + } + /// Handles all target related code padding when starting to write a new /// basic block to an object file. /// diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index a4c99a0c1c152..ef6f0041e0c87 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -524,6 +524,11 @@ static void AttemptToFoldSymbolOffsetDifference( if (Asm->isThumbFunc(&SA)) Addend |= 1; + // If symbol is labeled as micromips, we set low-bit to ensure + // correct offset in .gcc_except_table + if (Asm->getBackend().isMicroMips(&SA)) + Addend |= 1; + // Clear the symbol expr pointers to indicate we have folded these // operands. 
A = B = nullptr; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 4397c971d0802..3b1b94acb1493 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -25,6 +25,7 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -568,6 +569,14 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, } } +bool MipsAsmBackend::isMicroMips(const MCSymbol *Sym) const { + if (const auto *ElfSym = dyn_cast(Sym)) { + if (ElfSym->getOther() & ELF::STO_MIPS_MICROMIPS) + return true; + } + return false; +} + MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 3d5e16fcf9b41..30359132e92b2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -25,6 +25,7 @@ class MCAssembler; struct MCFixupKindInfo; class MCObjectWriter; class MCRegisterInfo; +class MCSymbolELF; class Target; class MipsAsmBackend : public MCAsmBackend { @@ -90,6 +91,7 @@ class MipsAsmBackend : public MCAsmBackend { bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override; + bool isMicroMips(const MCSymbol *Sym) const override; }; // class MipsAsmBackend } // namespace diff --git a/test/CodeGen/Mips/micromips-gcc-except-table.ll b/test/CodeGen/Mips/micromips-gcc-except-table.ll new file mode 100644 index 0000000000000..38a76927e2a8a --- /dev/null +++ b/test/CodeGen/Mips/micromips-gcc-except-table.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -O3 -filetype=obj < %s | 
llvm-objdump -s -j .gcc_except_table - | FileCheck %s + +; CHECK: Contents of section .gcc_except_table: +; CHECK-NEXT: 0000 ff9b1501 0c011100 00110e1f 011f1800 +; CHECK-NEXT: 0010 00010000 00000000 + +@_ZTIi = external constant i8* + +define dso_local i32 @main() local_unnamed_addr norecurse personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception.i to i32* + store i32 5, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %.noexc unwind label %return + +.noexc: + unreachable + +return: + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind + tail call void @__cxa_end_catch() + ret i32 0 +} + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr + +declare void @__cxa_end_catch() local_unnamed_addr + +declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr From 5993754bccbfd2e500ba7747841eb169d41512b2 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 20 Nov 2018 03:42:43 +0000 Subject: [PATCH 10/21] Merging r345353: ------------------------------------------------------------------------ r345353 | sima | 2018-10-25 18:28:36 -0700 (Thu, 25 Oct 2018) | 21 lines Teach the DominatorTree fallback to recalculation when applying updates to speedup JT (PR37929) Summary: This patch makes the dominatortree recalculate when applying updates with the size of the update vector larger than a threshold. Directly applying updates is usually slower than recalculating the whole domtree in this case. This patch fixes an issue which causes JT running slowly on some inputs. In bug 37929, the dominator tree is trying to apply 19,000+ updates several times, which takes several minutes. 
After this patch, the time used by DT.applyUpdates: | Input | Before (s) | After (s) | Speedup | | the 2nd Reproducer in 37929 | 297 | 0.15 | 1980x | | clang-5.0.0.0.bc | 9.7 | 4.3 | 2.26x | | clang-5.0.0.4.bc | 11.6 | 2.6 | 4.46x | Reviewers: kuhar, brzycki, trentxintong, davide, dmgreen, grosser Reviewed By: kuhar, brzycki Subscribers: kristina, llvm-commits Differential Revision: https://reviews.llvm.org/D53245 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347285 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/GenericDomTreeConstruction.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index 103ff8ca476a9..977f209f92b39 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -1186,6 +1186,20 @@ struct SemiNCAInfo { << '\t' << U << "\n"); LLVM_DEBUG(dbgs() << "\n"); + // Recalculate the DominatorTree when the number of updates + // exceeds a threshold, which usually makes direct updating slower than + // recalculation. We select this threshold proportional to the + // size of the DominatorTree. The constant is selected + // by choosing the one with an acceptable performance on some real-world + // inputs. + + // Make unittests of the incremental algorithm work + if (DT.DomTreeNodes.size() <= 100) { + if (NumLegalized > DT.DomTreeNodes.size()) + CalculateFromScratch(DT, &BUI); + } else if (NumLegalized > DT.DomTreeNodes.size() / 40) + CalculateFromScratch(DT, &BUI); + // If the DominatorTree was recalculated at some point, stop the batch // updates. Full recalculations ignore batch updates and look at the actual // CFG. 
From 92101a335c58418009c450d34e37b85af08cbbe7 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 28 Nov 2018 21:46:14 +0000 Subject: [PATCH 11/21] Merging r342865: ------------------------------------------------------------------------ r342865 | courbet | 2018-09-24 01:39:48 -0700 (Mon, 24 Sep 2018) | 11 lines [llvm-exegesis] Fix PR39021. Summary: The `set` statements was incorrectly reading the value of the local variable and setting the value of the parent variable. Reviewers: tycho, gchatelet, john.brawn Subscribers: mgorny, tschuett, llvm-commits Differential Revision: https://reviews.llvm.org/D52343 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347811 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/llvm-exegesis/lib/CMakeLists.txt b/tools/llvm-exegesis/lib/CMakeLists.txt index 175c2adf9de85..194304adf9851 100644 --- a/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/tools/llvm-exegesis/lib/CMakeLists.txt @@ -1,12 +1,16 @@ +set(TARGETS_TO_APPEND "") + if (LLVM_TARGETS_TO_BUILD MATCHES "X86") add_subdirectory(X86) - set(LLVM_EXEGESIS_TARGETS "${LLVM_EXEGESIS_TARGETS} X86" PARENT_SCOPE) + set(TARGETS_TO_APPEND "${TARGETS_TO_APPEND} X86") endif() if (LLVM_TARGETS_TO_BUILD MATCHES "AArch64") add_subdirectory(AArch64) - set(LLVM_EXEGESIS_TARGETS "${LLVM_EXEGESIS_TARGETS} AArch64" PARENT_SCOPE) + set(TARGETS_TO_APPEND "${TARGETS_TO_APPEND} AArch64") endif() +set(LLVM_EXEGESIS_TARGETS "${LLVM_EXEGESIS_TARGETS} ${TARGETS_TO_APPEND}" PARENT_SCOPE) + add_library(LLVMExegesis STATIC Analysis.cpp From f8ee49e3d8b9a064d424e30c1d97086509e8556d Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 29 Nov 2018 23:40:59 +0000 Subject: [PATCH 12/21] Merging r347431: ------------------------------------------------------------------------ r347431 | rnk | 2018-11-21 
14:01:10 -0800 (Wed, 21 Nov 2018) | 12 lines [mingw] Use unmangled name after the $ in the section name GCC does it this way, and we have to be consistent. This includes stdcall and fastcall functions with suffixes. I confirmed that a fastcall function named "foo" ends up in ".text$foo", not ".text$@foo@8". Based on a patch by Andrew Yohn! Fixes PR39218. Differential Revision: https://reviews.llvm.org/D54762 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347931 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 5 +-- test/CodeGen/X86/mingw-comdats.ll | 33 +++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f6b91a2f0231f..16140f0b12be6 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1156,10 +1156,11 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( MCSymbol *Sym = TM.getSymbol(ComdatGV); StringRef COMDATSymName = Sym->getName(); - // Append "$symbol" to the section name when targetting mingw. The ld.bfd + // Append "$symbol" to the section name *before* IR-level mangling is + // applied when targetting mingw. This is what GCC does, and the ld.bfd // COFF linker will not properly handle comdats otherwise. 
if (getTargetTriple().isWindowsGNUEnvironment()) - raw_svector_ostream(Name) << '$' << COMDATSymName; + raw_svector_ostream(Name) << '$' << ComdatGV->getName(); return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName, Selection, UniqueID); diff --git a/test/CodeGen/X86/mingw-comdats.ll b/test/CodeGen/X86/mingw-comdats.ll index 2e9ebd8c9fc4c..35f4fd12670c2 100644 --- a/test/CodeGen/X86/mingw-comdats.ll +++ b/test/CodeGen/X86/mingw-comdats.ll @@ -1,13 +1,14 @@ -; RUN: llc -mtriple=x86_64-windows-itanium < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU -; RUN: llc -mtriple=i686-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU32 -; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - -headers | FileCheck %s --check-prefix=GNUOBJ +; RUN: llc -function-sections -mtriple=x86_64-windows-itanium < %s | FileCheck %s +; RUN: llc -function-sections -mtriple=x86_64-windows-msvc < %s | FileCheck %s +; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU +; RUN: llc -function-sections -mtriple=i686-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU32 +; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - -headers | FileCheck %s --check-prefix=GNUOBJ ; GCC and MSVC handle comdats completely differently. Make sure we do the right ; thing for each. 
-; Generated with this C++ source: +; Modeled on this C++ source, with additional modifications for +; -ffunction-sections: ; int bar(int); ; __declspec(selectany) int gv = 42; ; inline int foo(int x) { return bar(x) + gv; } @@ -26,8 +27,24 @@ entry: ret i32 %call } +; CHECK: .section .text,"xr",one_only,main ; CHECK: main: +; GNU: .section .text$main,"xr",one_only,main ; GNU: main: +; GNU32: .section .text$main,"xr",one_only,_main +; GNU32: _main: + +define dso_local x86_fastcallcc i32 @fastcall(i32 %x, i32 %y) { + %rv = add i32 %x, %y + ret i32 %rv +} + +; CHECK: .section .text,"xr",one_only,fastcall +; CHECK: fastcall: +; GNU: .section .text$fastcall,"xr",one_only,fastcall +; GNU: fastcall: +; GNU32: .section .text$fastcall,"xr",one_only,@fastcall@8 +; GNU32: @fastcall@8: ; Function Attrs: inlinehint uwtable define linkonce_odr dso_local i32 @_Z3fooi(i32 %x) #1 comdat { @@ -50,9 +67,9 @@ entry: ; GNU: gv: ; GNU: .long 42 -; GNU32: .section .text$__Z3fooi,"xr",discard,__Z3fooi +; GNU32: .section .text$_Z3fooi,"xr",discard,__Z3fooi ; GNU32: __Z3fooi: -; GNU32: .section .data$_gv,"dw",discard,_gv +; GNU32: .section .data$gv,"dw",discard,_gv ; GNU32: _gv: ; GNU32: .long 42 From e8af9b4c407003c1c771aa16fe2b3c747a0d344e Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 30 Nov 2018 04:51:41 +0000 Subject: [PATCH 13/21] Merging r339260: ------------------------------------------------------------------------ r339260 | syzaara | 2018-08-08 08:20:43 -0700 (Wed, 08 Aug 2018) | 13 lines [PowerPC] Improve codegen for vector loads using scalar_to_vector This patch aims to improve the codegen for vector loads involving the scalar_to_vector (load X) sequence. Initially, ld->mv instructions were used for scalar_to_vector (load X), so this patch allows scalar_to_vector (load X) to utilize: LXSD and LXSDX for i64 and f64 LXSIWAX for i32 (sign extension to i64) LXSIWZX for i32 and f64 Committing on behalf of Amy Kwan. 
Differential Revision: https://reviews.llvm.org/D48950 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@347957 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/P9InstrResources.td | 1 + lib/Target/PowerPC/PPCISelLowering.cpp | 11 - lib/Target/PowerPC/PPCInstrVSX.td | 91 ++++- test/CodeGen/PowerPC/VSX-XForm-Scalars.ll | 61 ++-- test/CodeGen/PowerPC/build-vector-tests.ll | 44 ++- test/CodeGen/PowerPC/load-v4i8-improved.ll | 22 +- .../PowerPC/power9-moves-and-splats.ll | 246 +++++++++---- test/CodeGen/PowerPC/pr38087.ll | 5 +- test/CodeGen/PowerPC/qpx-load-splat.ll | 62 ++-- test/CodeGen/PowerPC/scalar_vector_test_1.ll | 292 +++++++++++++++ test/CodeGen/PowerPC/scalar_vector_test_2.ll | 118 ++++++ test/CodeGen/PowerPC/scalar_vector_test_3.ll | 265 ++++++++++++++ test/CodeGen/PowerPC/scalar_vector_test_4.ll | 341 ++++++++++++++++++ test/CodeGen/PowerPC/swaps-le-6.ll | 89 +++-- test/CodeGen/PowerPC/vsx_insert_extract_le.ll | 123 +++++-- 15 files changed, 1529 insertions(+), 242 deletions(-) create mode 100644 test/CodeGen/PowerPC/scalar_vector_test_1.ll create mode 100644 test/CodeGen/PowerPC/scalar_vector_test_2.ll create mode 100644 test/CodeGen/PowerPC/scalar_vector_test_3.ll create mode 100644 test/CodeGen/PowerPC/scalar_vector_test_4.ll diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index 34df8452fe166..c6cbb9037ede1 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -592,6 +592,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], XXPERM, XXPERMR, XXSLDWI, + XXSLDWIs, XXSPLTIB, XXSPLTW, XXSPLTWs, diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 331dbcbbe060e..b5bdf47ce37ae 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -8454,17 +8454,6 
@@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); - // If the source for the shuffle is a scalar_to_vector that came from a - // 32-bit load, it will have used LXVWSX so we don't need to splat again. - if (Subtarget.hasP9Vector() && - ((isLittleEndian && SplatIdx == 3) || - (!isLittleEndian && SplatIdx == 0))) { - SDValue Src = V1.getOperand(0); - if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && - Src.getOperand(0).getOpcode() == ISD::LOAD && - Src.getOperand(0).hasOneUse()) - return V1; - } SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, DAG.getConstant(SplatIdx, dl, MVT::i32)); diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 183512acaf9e1..781a3277441af 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -877,6 +877,12 @@ let Uses = [RM] in { "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, [(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB, imm32SExt16:$SHW))]>; + + let isCodeGenOnly = 1 in + def XXSLDWIs : XX3Form_2s<60, 2, + (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$SHW), + "xxsldwi $XT, $XA, $XA, $SHW", IIC_VecPerm, []>; + def XXSPLTW : XX2Form_2<60, 164, (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM), "xxspltw $XT, $XB, $UIM", IIC_VecPerm, @@ -886,6 +892,7 @@ let Uses = [RM] in { def XXSPLTWs : XX2Form_2<60, 164, (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM), "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; + } // hasSideEffects } // UseVSXReg = 1 @@ -1466,8 +1473,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))), (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; } - def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)), - (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>; // Instructions for converting float to i64 feeding a store. let Predicates = [NoP9Vector] in { @@ -3050,13 +3055,47 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (STXVX $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), - (v4i32 (LXVWSX xoaddr:$src))>; - def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), - (v4f32 (LXVWSX xoaddr:$src))>; - def : Pat<(v4f32 (scalar_to_vector - (f32 (fpround (f64 (extloadf32 xoaddr:$src)))))), - (v4f32 (LXVWSX xoaddr:$src))>; + + let AddedComplexity = 400 in { + // LIWAX - This instruction is used for sign extending i32 -> i64. + // LIWZX - This instruction will be emitted for i32, f32, and when + // zero-extending i32 to i64 (zext i32 -> i64). 
+ let Predicates = [IsLittleEndian] in { + + def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))), + (v2i64 (XXPERMDIs + (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC), 2))>; + + def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))), + (v2i64 (XXPERMDIs + (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>; + + def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), + (v4i32 (XXPERMDIs + (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>; + + def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), + (v4f32 (XXPERMDIs + (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>; + } + + let Predicates = [IsBigEndian] in { + def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))), + (v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>; + + def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))), + (v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>; + + def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), + (v4i32 (XXSLDWIs + (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>; + + def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), + (v4f32 (XXSLDWIs + (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>; + } + + } // Build vectors from i8 loads def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)), @@ -3218,6 +3257,39 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), (f32 (DFLOADf32 ixaddr:$src))>; + + let AddedComplexity = 400 in { + // The following pseudoinstructions are used to ensure the utilization + // of all 64 VSX registers. 
+ let Predicates = [IsLittleEndian, HasP9Vector] in { + def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))), + (v2i64 (XXPERMDIs + (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>; + def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))), + (v2i64 (XXPERMDIs + (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>; + + def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))), + (v2f64 (XXPERMDIs + (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>; + def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))), + (v2f64 (XXPERMDIs + (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>; + } + + let Predicates = [IsBigEndian, HasP9Vector] in { + def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))), + (v2i64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>; + def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))), + (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>; + + def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))), + (v2f64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>; + def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))), + (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>; + } + } + let Predicates = [IsBigEndian, HasP9Vector] in { // (Un)Signed DWord vector extract -> QP @@ -3932,3 +4004,4 @@ let AddedComplexity = 400 in { (v4i32 (VEXTSH2W $A))>; } } + diff --git a/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll b/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll index e38c5beb80ebb..643ec904ea38f 100644 --- a/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll +++ b/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll @@ -1,35 +1,46 @@ ; RUN: llc < %s -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \ -; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-P8 +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=CHECK-P8 ; RUN: llc < %s -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ -; RUN: -verify-machineinstrs | FileCheck %s 
--check-prefix=CHECK-P9 +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=CHECK-P9 @a = external local_unnamed_addr global <4 x i32>, align 16 @pb = external local_unnamed_addr global float*, align 8 define void @testExpandPostRAPseudo(i32* nocapture readonly %ptr) { -; CHECK-P8-LABEL: testExpandPostRAPseudo: -; CHECK-P8: lxsiwax 34, 0, 3 -; CHECK-P8-NEXT: xxspltw 34, 34, 1 -; CHECK-P8-NEXT: stvx 2, 0, 4 -; CHECK-P8: #APP -; CHECK-P8-NEXT: #Clobber Rigisters -; CHECK-P8-NEXT: #NO_APP -; CHECK-P8-NEXT: lis 4, 1024 -; CHECK-P8-NEXT: lfiwax 0, 0, 3 -; CHECK-P8: stfsx 0, 3, 4 -; CHECK-P8-NEXT: blr - -; CHECK-P9-LABEL: testExpandPostRAPseudo: -; CHECK-P9: lxvwsx 0, 0, 3 -; CHECK-P9: stxvx 0, 0, 4 -; CHECK-P9: #APP -; CHECK-P9-NEXT: #Clobber Rigisters -; CHECK-P9-NEXT: #NO_APP -; CHECK-P9-NEXT: lis 4, 1024 -; CHECK-P9-NEXT: lfiwax 0, 0, 3 -; CHECK-P9: stfsx 0, 3, 4 -; CHECK-P9-NEXT: blr - +; CHECK-P8-LABEL: testExpandPostRAPseudo: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8: lfiwzx f0, 0, r3 +; CHECK-P8: ld r4, .LC0@toc@l(r4) +; CHECK-P8: xxpermdi vs0, f0, f0, 2 +; CHECK-P8: xxspltw v2, vs0, 3 +; CHECK-P8: stvx v2, 0, r4 +; CHECK-P8: lis r4, 1024 +; CHECK-P8: lfiwax f0, 0, r3 +; CHECK-P8: addis r3, r2, .LC1@toc@ha +; CHECK-P8: ld r3, .LC1@toc@l(r3) +; CHECK-P8: xscvsxdsp f0, f0 +; CHECK-P8: ld r3, 0(r3) +; CHECK-P8: stfsx f0, r3, r4 +; CHECK-P8: blr +; +; CHECK-P9-LABEL: testExpandPostRAPseudo: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: lfiwzx f0, 0, r3 +; CHECK-P9: addis r4, r2, .LC0@toc@ha +; CHECK-P9: ld r4, .LC0@toc@l(r4) +; CHECK-P9: xxpermdi vs0, f0, f0, 2 +; CHECK-P9: xxspltw vs0, vs0, 3 +; CHECK-P9: stxvx vs0, 0, r4 +; CHECK-P9: lis r4, 1024 +; CHECK-P9: lfiwax f0, 0, r3 +; CHECK-P9: addis r3, r2, .LC1@toc@ha +; CHECK-P9: ld r3, .LC1@toc@l(r3) +; CHECK-P9: xscvsxdsp f0, f0 +; CHECK-P9: ld r3, 0(r3) +; CHECK-P9: stfsx f0, r3, r4 +; CHECK-P9: blr entry: %0 = load i32, i32* %ptr, align 4 
%splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index f074e2a0c0d02..d192bafca2355 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -109,8 +109,8 @@ ;vector int spltRegVali(int val) { // ; return (vector int) val; // ;} // -;// P8: lxsiwax, xxspltw // -;// P9: lxvwsx // +;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // ;vector int spltMemVali(int *ptr) { // ; return (vector int)*ptr; // ;} // @@ -284,8 +284,8 @@ ;vector unsigned int spltRegValui(unsigned int val) { // ; return (vector unsigned int) val; // ;} // -;// P8: lxsiwax, xxspltw // -;// P9: lxvwsx // +;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // +;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw // ;vector unsigned int spltMemValui(unsigned int *ptr) { // ; return (vector unsigned int)*ptr; // ;} // @@ -1202,15 +1202,21 @@ entry: ; P9LE-LABEL: spltMemVali ; P8BE-LABEL: spltMemVali ; P8LE-LABEL: spltMemVali -; P9BE: lxvwsx v2, 0, r3 +; P9BE: lfiwzx f0, 0, r3 +; P9BE: xxsldwi vs0, f0, f0, 1 +; P9BE: xxspltw v2, vs0, 0 ; P9BE: blr -; P9LE: lxvwsx v2, 0, r3 +; P9LE: lfiwzx f0, 0, r3 +; P9LE: xxpermdi vs0, f0, f0, 2 +; P9LE: xxspltw v2, vs0, 3 ; P9LE: blr -; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3 -; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 +; P8BE: lfiwzx f0, 0, r3 +; P8BE: xxsldwi vs0, f0, f0, 1 +; P8BE: xxspltw v2, vs0, 0 ; P8BE: blr -; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3 -; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 +; P8LE: lfiwzx f0, 0, r3 +; P8LE: xxpermdi vs0, f0, f0, 2 +; P8LE: xxspltw v2, vs0, 3 ; P8LE: blr } @@ -2338,15 +2344,21 @@ entry: ; P9LE-LABEL: spltMemValui ; P8BE-LABEL: spltMemValui ; P8LE-LABEL: spltMemValui -; P9BE: lxvwsx v2, 0, r3 +; P9BE: lfiwzx f0, 0, r3 +; P9BE: xxsldwi vs0, f0, f0, 1 
+; P9BE: xxspltw v2, vs0, 0 ; P9BE: blr -; P9LE: lxvwsx v2, 0, r3 +; P9LE: lfiwzx f0, 0, r3 +; P9LE: xxpermdi vs0, f0, f0, 2 +; P9LE: xxspltw v2, vs0, 3 ; P9LE: blr -; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3 -; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1 +; P8BE: lfiwzx f0, 0, r3 +; P8BE: xxsldwi vs0, f0, f0, 1 +; P8BE: xxspltw v2, vs0, 0 ; P8BE: blr -; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3 -; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1 +; P8LE: lfiwzx f0, 0, r3 +; P8LE: xxpermdi vs0, f0, f0, 2 +; P8LE: xxspltw v2, vs0, 3 ; P8LE: blr } diff --git a/test/CodeGen/PowerPC/load-v4i8-improved.ll b/test/CodeGen/PowerPC/load-v4i8-improved.ll index 36f347222d5f0..f1fa29960742f 100644 --- a/test/CodeGen/PowerPC/load-v4i8-improved.ll +++ b/test/CodeGen/PowerPC/load-v4i8-improved.ll @@ -1,15 +1,27 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-LE \ ; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck \ ; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s define <16 x i8> @test(i32* %s, i32* %t) { +; CHECK-LE-LABEL: test: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-LE-NEXT: xxspltw v2, vs0, 3 +; CHECK-LE-NEXT: blr + +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-NEXT: xxspltw v2, vs0, 0 +; CHECK-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* %1 = load <4 x i8>, <4 x i8>* %0, align 4 %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> ret <16 x 
i8> %2 -; CHECK-LABEL: test -; CHECK: lxsiwax 34, 0, 3 -; CHECK: xxspltw 34, 34, 1 } diff --git a/test/CodeGen/PowerPC/power9-moves-and-splats.ll b/test/CodeGen/PowerPC/power9-moves-and-splats.ll index fc676cc0885f0..5ccba80fa4b26 100644 --- a/test/CodeGen/PowerPC/power9-moves-and-splats.ll +++ b/test/CodeGen/PowerPC/power9-moves-and-splats.ll @@ -1,47 +1,74 @@ -; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s \ -; RUN: --check-prefix=CHECK-BE +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefix=CHECK-BE @Globi = external global i32, align 4 @Globf = external global float, align 4 define <2 x i64> @test1(i64 %a, i64 %b) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mtvsrdd v2, r4, r3 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mtvsrdd v2, r3, r4 +; CHECK-BE-NEXT: blr entry: ; The FIXME below is due to the lowering for BUILD_VECTOR needing a re-vamp ; which will happen in a subsequent patch. 
-; CHECK-LABEL: test1 -; CHECK: mtvsrdd 34, 4, 3 -; CHECK-BE-LABEL: test1 -; CHECK-BE: mtvsrdd 34, 3, 4 %vecins = insertelement <2 x i64> undef, i64 %a, i32 0 %vecins1 = insertelement <2 x i64> %vecins, i64 %b, i32 1 ret <2 x i64> %vecins1 } define i64 @test2(<2 x i64> %a) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mfvsrld r3, v2 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mfvsrd r3, v2 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test2 -; CHECK: mfvsrld 3, 34 %0 = extractelement <2 x i64> %a, i32 0 ret i64 %0 } define i64 @test3(<2 x i64> %a) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mfvsrd r3, v2 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test3: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: mfvsrld r3, v2 +; CHECK-BE-NEXT: blr entry: -; CHECK-BE-LABEL: test3 -; CHECK-BE: mfvsrld 3, 34 %0 = extractelement <2 x i64> %a, i32 1 ret i64 %0 } define <4 x i32> @test4(i32* nocapture readonly %in) { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test4 -; CHECK: lxvwsx 34, 0, 3 -; CHECK-NOT: xxspltw -; CHECK-BE-LABEL: test4 -; CHECK-BE: lxvwsx 34, 0, 3 -; CHECK-BE-NOT: xxspltw %0 = load i32, i32* %in, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -49,13 +76,20 @@ entry: } define <4 x float> @test5(float* nocapture readonly %in) { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr + 
+; CHECK-BE-LABEL: test5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test5 -; CHECK: lxvwsx 34, 0, 3 -; CHECK-NOT: xxspltw -; CHECK-BE-LABEL: test5 -; CHECK-BE: lxvwsx 34, 0, 3 -; CHECK-BE-NOT: xxspltw %0 = load float, float* %in, align 4 %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0 %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer @@ -63,17 +97,24 @@ entry: } define <4 x i32> @test6() { +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r3, r2, .LC0@toc@ha +; CHECK-NEXT: ld r3, .LC0@toc@l(r3) +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: addis r3, r2, .LC0@toc@ha +; CHECK-BE-NEXT: ld r3, .LC0@toc@l(r3) +; CHECK-BE-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test6 -; CHECK: addis -; CHECK: ld [[TOC:[0-9]+]], .LC0 -; CHECK: lxvwsx 34, 0, 3 -; CHECK-NOT: xxspltw -; CHECK-BE-LABEL: test6 -; CHECK-BE: addis -; CHECK-BE: ld [[TOC:[0-9]+]], .LC0 -; CHECK-BE: lxvwsx 34, 0, 3 -; CHECK-BE-NOT: xxspltw %0 = load i32, i32* @Globi, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -81,17 +122,24 @@ entry: } define <4 x float> @test7() { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r3, r2, .LC1@toc@ha +; CHECK-NEXT: ld r3, .LC1@toc@l(r3) +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test7: +; CHECK-BE: # %bb.0: # %entry +; 
CHECK-BE-NEXT: addis r3, r2, .LC1@toc@ha +; CHECK-BE-NEXT: ld r3, .LC1@toc@l(r3) +; CHECK-BE-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-NEXT: xxspltw v2, vs0, 0 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test7 -; CHECK: addis -; CHECK: ld [[TOC:[0-9]+]], .LC1 -; CHECK: lxvwsx 34, 0, 3 -; CHECK-NOT: xxspltw -; CHECK-BE-LABEL: test7 -; CHECK-BE: addis -; CHECK-BE: ld [[TOC:[0-9]+]], .LC1 -; CHECK-BE: lxvwsx 34, 0, 3 -; CHECK-BE-NOT: xxspltw %0 = load float, float* @Globf, align 4 %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0 %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer @@ -99,76 +147,120 @@ entry: } define <16 x i8> @test8() { +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v2, v2, v2 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test8: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxlxor v2, v2, v2 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test8 -; CHECK: xxlxor 34, 34, 34 -; CHECK-BE-LABEL: test8 -; CHECK-BE: xxlxor 34, 34, 34 ret <16 x i8> zeroinitializer } define <16 x i8> @test9() { +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 1 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test9: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 1 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test9 -; CHECK: xxspltib 34, 1 -; CHECK-BE-LABEL: test9 -; CHECK-BE: xxspltib 34, 1 ret <16 x i8> } define <16 x i8> @test10() { +; CHECK-LABEL: test10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 127 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test10: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 127 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test10 -; CHECK: xxspltib 34, 127 -; CHECK-BE-LABEL: test10 -; CHECK-BE: xxspltib 34, 127 ret <16 x i8> } define <16 x i8> @test11() { +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 128 +; CHECK-NEXT: blr + +; 
CHECK-BE-LABEL: test11: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 128 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test11 -; CHECK: xxspltib 34, 128 -; CHECK-BE-LABEL: test11 -; CHECK-BE: xxspltib 34, 128 ret <16 x i8> } define <16 x i8> @test12() { +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 255 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test12: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 255 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test12 -; CHECK: xxspltib 34, 255 -; CHECK-BE-LABEL: test12 -; CHECK-BE: xxspltib 34, 255 ret <16 x i8> } define <16 x i8> @test13() { +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 129 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test13: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 129 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test13 -; CHECK: xxspltib 34, 129 -; CHECK-BE-LABEL: test13 -; CHECK-BE: xxspltib 34, 129 ret <16 x i8> } define <16 x i8> @test13E127() { +; CHECK-LABEL: test13E127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxspltib v2, 200 +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test13E127: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xxspltib v2, 200 +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test13E127 -; CHECK: xxspltib 34, 200 -; CHECK-BE-LABEL: test13E127 -; CHECK-BE: xxspltib 34, 200 ret <16 x i8> } define <4 x i32> @test14(<4 x i32> %a, i32* nocapture readonly %b) { +; CHECK-LABEL: test14: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz r3, 0(r5) +; CHECK-NEXT: mtvsrws v2, r3 +; CHECK-NEXT: addi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r5) +; CHECK-NEXT: blr + +; CHECK-BE-LABEL: test14: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lwz r3, 0(r5) +; CHECK-BE-NEXT: mtvsrws v2, r3 +; CHECK-BE-NEXT: addi r3, r3, 5 +; CHECK-BE-NEXT: stw r3, 0(r5) +; CHECK-BE-NEXT: blr entry: -; CHECK-LABEL: test14 -; CHECK: lwz [[LD:[0-9]+]], -; CHECK: mtvsrws 34, [[LD]] -; CHECK-BE-LABEL: test14 -; CHECK-BE: lwz 
[[LD:[0-9]+]], -; CHECK-BE: mtvsrws 34, [[LD]] %0 = load i32, i32* %b, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer diff --git a/test/CodeGen/PowerPC/pr38087.ll b/test/CodeGen/PowerPC/pr38087.ll index af8704f7d7081..2736ffa723c1a 100644 --- a/test/CodeGen/PowerPC/pr38087.ll +++ b/test/CodeGen/PowerPC/pr38087.ll @@ -11,9 +11,8 @@ declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0 define void @draw_llvm_vs_variant0() { ; CHECK-LABEL: draw_llvm_vs_variant0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ldx r3, 0, r3 -; CHECK-NEXT: mtvsrd f0, r3 -; CHECK-NEXT: xxswapd v2, vs0 +; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: xxpermdi v2, f0, f0, 2 ; CHECK-NEXT: vmrglh v2, v2, v2 ; CHECK-NEXT: vextsh2w v2, v2 ; CHECK-NEXT: xvcvsxwsp vs0, v2 diff --git a/test/CodeGen/PowerPC/qpx-load-splat.ll b/test/CodeGen/PowerPC/qpx-load-splat.ll index 034961815178c..1afd27262ba84 100644 --- a/test/CodeGen/PowerPC/qpx-load-splat.ll +++ b/test/CodeGen/PowerPC/qpx-load-splat.ll @@ -1,35 +1,44 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s ; Function Attrs: norecurse nounwind readonly define <4 x double> @foo(double* nocapture readonly %a) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvdsx v2, 0, r3 +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr entry: %0 = load double, double* %a, align 8 %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %shuffle.i - -; CHECK-LABEL: @foo -; CHECK: lfd 1, 0(3) -; CHECK: blr } define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) 
#0 { +; CHECK-LABEL: foox: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 3 +; CHECK-NEXT: lxvdsx v2, r3, r4 +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %shuffle.i - -; CHECK-LABEL: @foox -; CHECK: sldi [[REG1:[0-9]+]], 4, 3 -; CHECK: lfdx 1, 3, [[REG1]] -; CHECK: blr } define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { +; CHECK-LABEL: fooxu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 3 +; CHECK-NEXT: lfdux f0, r3, r4 +; CHECK-NEXT: xxspltd v2, vs0, 0 +; CHECK-NEXT: std r3, 0(r5) +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: blr entry: %p = getelementptr double, double* %a, i64 %idx %0 = load double, double* %p, align 8 @@ -37,39 +46,36 @@ entry: %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer store double* %p, double** %pptr, align 8 ret <4 x double> %shuffle.i - -; CHECK-LABEL: @foox -; CHECK: sldi [[REG1:[0-9]+]], 4, 3 -; CHECK: lfdux 1, 3, [[REG1]] -; CHECK: std 3, 0(5) -; CHECK: blr } define <4 x float> @foof(float* nocapture readonly %a) #0 { +; CHECK-LABEL: foof: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfiwzx f0, 0, r3 +; CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr entry: %0 = load float, float* %a, align 4 %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %shuffle.i - -; CHECK-LABEL: @foof -; CHECK: lfs 1, 0(3) -; CHECK: blr } define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { +; CHECK-LABEL: foofx: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 2 +; CHECK-NEXT: lfiwzx f0, r3, r4 +; 
CHECK-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-NEXT: xxspltw v2, vs0, 3 +; CHECK-NEXT: blr entry: %p = getelementptr float, float* %a, i64 %idx %0 = load float, float* %p, align 4 %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %shuffle.i - -; CHECK-LABEL: @foofx -; CHECK: sldi [[REG1:[0-9]+]], 4, 2 -; CHECK: lfsx 1, 3, [[REG1]] -; CHECK: blr } -attributes #0 = { norecurse nounwind readonly "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" } diff --git a/test/CodeGen/PowerPC/scalar_vector_test_1.ll b/test/CodeGen/PowerPC/scalar_vector_test_1.ll new file mode 100644 index 0000000000000..1b5ddff0374c6 --- /dev/null +++ b/test/CodeGen/PowerPC/scalar_vector_test_1.ll @@ -0,0 +1,292 @@ +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8BE + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test1(i64* nocapture readonly %int64, <2 x i64> %vec) { +; P9LE-LABEL: s2v_test1: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 0(r3) +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test1: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 0(r3) +; P9BE-NEXT: xxpermdi v2, vs0, 
v2, 1 +; P9BE-NEXT: blr +entry: + %0 = load i64, i64* %int64, align 8 + %vecins = insertelement <2 x i64> %vec, i64 %0, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test2(i64* nocapture readonly %int64, <2 x i64> %vec) { +; P9LE-LABEL: s2v_test2: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 8(r3) +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test2: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 8(r3) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i64, i64* %int64, i64 1 + %0 = load i64, i64* %arrayidx, align 8 + %vecins = insertelement <2 x i64> %vec, i64 %0, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test3(i64* nocapture readonly %int64, <2 x i64> %vec, i32 signext %Idx) { +; P9LE-LABEL: s2v_test3: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: sldi r4, r7, 3 +; P9LE-NEXT: lfdx f0, r3, r4 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test3 +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: sldi r4, r7, 3 +; P9BE-NEXT: lfdx f0, r3, r4 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr +entry: + %idxprom = sext i32 %Idx to i64 + %arrayidx = getelementptr inbounds i64, i64* %int64, i64 %idxprom + %0 = load i64, i64* %arrayidx, align 8 + %vecins = insertelement <2 x i64> %vec, i64 %0, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test4(i64* nocapture readonly %int64, <2 x i64> %vec) { +; P9LE-LABEL: s2v_test4: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 8(r3) +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test4: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 8(r3) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr +entry: + %arrayidx 
= getelementptr inbounds i64, i64* %int64, i64 1 + %0 = load i64, i64* %arrayidx, align 8 + %vecins = insertelement <2 x i64> %vec, i64 %0, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test5(<2 x i64> %vec, i64* nocapture readonly %ptr1) { +; P9LE-LABEL: s2v_test5: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 0(r5) +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test5: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 0(r5) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr +entry: + %0 = load i64, i64* %ptr1, align 8 + %vecins = insertelement <2 x i64> %vec, i64 %0, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x double> @s2v_test_f1(double* nocapture readonly %f64, <2 x double> %vec) { +; P9LE-LABEL: s2v_test_f1: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 0(r3) +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f1: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 0(r3) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f1: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfdx f0, 0, r3 +; P8LE-NEXT: xxspltd vs0, vs0, 0 +; P8LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f1: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfdx f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %0 = load double, double* %f64, align 8 + %vecins = insertelement <2 x double> %vec, double %0, i32 0 + ret <2 x double> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x double> @s2v_test_f2(double* nocapture readonly %f64, <2 x double> %vec) { +; P9LE-LABEL: s2v_test_f2: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 8(r3) +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f2: 
+; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 8(r3) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f2: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 8 +; P8LE-NEXT: lfdx f0, 0, r3 +; P8LE-NEXT: xxspltd vs0, vs0, 0 +; P8LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f2: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 8 +; P8BE-NEXT: lfdx f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds double, double* %f64, i64 1 + %0 = load double, double* %arrayidx, align 8 + %vecins = insertelement <2 x double> %vec, double %0, i32 0 + ret <2 x double> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x double> @s2v_test_f3(double* nocapture readonly %f64, <2 x double> %vec, i32 signext %Idx) { +; P9LE-LABEL: s2v_test_f3: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: sldi r4, r7, 3 +; P9LE-NEXT: lfdx f0, r3, r4 +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f3: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: sldi r4, r7, 3 +; P9BE-NEXT: lfdx f0, r3, r4 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f3: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: sldi r4, r7, 3 +; P8LE-NEXT: lfdx f0, r3, r4 +; P8LE-NEXT: xxspltd vs0, vs0, 0 +; P8LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f3: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: sldi r4, r7, 3 +; P8BE-NEXT: lfdx f0, r3, r4 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %idxprom = sext i32 %Idx to i64 + %arrayidx = getelementptr inbounds double, double* %f64, i64 %idxprom + %0 = load double, double* %arrayidx, align 8 + %vecins = insertelement <2 x double> %vec, double %0, i32 0 + ret <2 x double> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x double> @s2v_test_f4(double* nocapture readonly %f64, <2 x double> %vec) 
{ +; P9LE-LABEL: s2v_test_f4: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 8(r3) +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f4: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 8(r3) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f4: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 8 +; P8LE-NEXT: lfdx f0, 0, r3 +; P8LE-NEXT: xxspltd vs0, vs0, 0 +; P8LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f4: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 8 +; P8BE-NEXT: lfdx f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds double, double* %f64, i64 1 + %0 = load double, double* %arrayidx, align 8 + %vecins = insertelement <2 x double> %vec, double %0, i32 0 + ret <2 x double> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x double> @s2v_test_f5(<2 x double> %vec, double* nocapture readonly %ptr1) { +; P9LE-LABEL: s2v_test_f5: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfd f0, 0(r5) +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f5: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfd f0, 0(r5) +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f5: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfdx f0, 0, r5 +; P8LE-NEXT: xxspltd vs0, vs0, 0 +; P8LE-NEXT: xxpermdi v2, v2, vs0, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f5: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfdx f0, 0, r5 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %0 = load double, double* %ptr1, align 8 + %vecins = insertelement <2 x double> %vec, double %0, i32 0 + ret <2 x double> %vecins +} + diff --git a/test/CodeGen/PowerPC/scalar_vector_test_2.ll b/test/CodeGen/PowerPC/scalar_vector_test_2.ll new file mode 100644 index 0000000000000..da1b8bcaa3d02 --- 
/dev/null +++ b/test/CodeGen/PowerPC/scalar_vector_test_2.ll @@ -0,0 +1,118 @@ +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8BE + +define void @test_liwzx1(<1 x float>* %A, <1 x float>* %B, <1 x float>* %C) { +; P9LE-LABEL: test_liwzx1: +; P9LE: # %bb.0: +; P9LE-NEXT: lfiwzx f0, 0, r3 +; P9LE-NEXT: lfiwzx f1, 0, r4 +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi vs1, f1, f1, 2 +; P9LE-NEXT: xvaddsp vs0, vs0, vs1 +; P9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; P9LE-NEXT: xscvspdpn f0, vs0 +; P9LE-NEXT: stfs f0, 0(r5) +; P9LE-NEXT: blr + +; P9BE-LABEL: test_liwzx1: +; P9BE: # %bb.0: +; P9BE-NEXT: lfiwzx f0, 0, r3 +; P9BE-NEXT: lfiwzx f1, 0, r4 +; P9BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P9BE-NEXT: xxsldwi vs1, f1, f1, 1 +; P9BE-NEXT: xvaddsp vs0, vs0, vs1 +; P9BE-NEXT: xscvspdpn f0, vs0 +; P9BE-NEXT: stfs f0, 0(r5) +; P9BE-NEXT: blr + +; P8LE-LABEL: test_liwzx1: +; P8LE: # %bb.0: +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: lfiwzx f1, 0, r4 +; P8LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P8LE-NEXT: xxpermdi vs1, f1, f1, 2 +; P8LE-NEXT: xvaddsp vs0, vs0, vs1 +; P8LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; P8LE-NEXT: xscvspdpn f0, vs0 +; P8LE-NEXT: stfsx f0, 0, r5 +; P8LE-NEXT: blr + +; P8BE-LABEL: test_liwzx1: +; P8BE: # %bb.0: +; P8BE-NEXT: lfiwzx f0, 0, r3 +; P8BE-NEXT: lfiwzx f1, 0, r4 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; 
P8BE-NEXT: xxsldwi vs1, f1, f1, 1 +; P8BE-NEXT: xvaddsp vs0, vs0, vs1 +; P8BE-NEXT: xscvspdpn f0, vs0 +; P8BE-NEXT: stfsx f0, 0, r5 +; P8BE-NEXT: blr + %a = load <1 x float>, <1 x float>* %A + %b = load <1 x float>, <1 x float>* %B + %X = fadd <1 x float> %a, %b + store <1 x float> %X, <1 x float>* %C + ret void +} + +define <1 x float>* @test_liwzx2(<1 x float>* %A, <1 x float>* %B, <1 x float>* %C) { +; P9LE-LABEL: test_liwzx2: +; P9LE: # %bb.0: +; P9LE-NEXT: lfiwzx f0, 0, r3 +; P9LE-NEXT: lfiwzx f1, 0, r4 +; P9LE-NEXT: mr r3, r5 +; P9LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P9LE-NEXT: xxpermdi vs1, f1, f1, 2 +; P9LE-NEXT: xvsubsp vs0, vs0, vs1 +; P9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; P9LE-NEXT: xscvspdpn f0, vs0 +; P9LE-NEXT: stfs f0, 0(r5) +; P9LE-NEXT: blr + +; P9BE-LABEL: test_liwzx2: +; P9BE: # %bb.0: +; P9BE-NEXT: lfiwzx f0, 0, r3 +; P9BE-NEXT: lfiwzx f1, 0, r4 +; P9BE-NEXT: mr r3, r5 +; P9BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P9BE-NEXT: xxsldwi vs1, f1, f1, 1 +; P9BE-NEXT: xvsubsp vs0, vs0, vs1 +; P9BE-NEXT: xscvspdpn f0, vs0 +; P9BE-NEXT: stfs f0, 0(r5) +; P9BE-NEXT: blr + +; P8LE-LABEL: test_liwzx2: +; P8LE: # %bb.0: +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: lfiwzx f1, 0, r4 +; P8LE-NEXT: mr r3, r5 +; P8LE-NEXT: xxpermdi vs0, f0, f0, 2 +; P8LE-NEXT: xxpermdi vs1, f1, f1, 2 +; P8LE-NEXT: xvsubsp vs0, vs0, vs1 +; P8LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; P8LE-NEXT: xscvspdpn f0, vs0 +; P8LE-NEXT: stfsx f0, 0, r5 +; P8LE-NEXT: blr + +; P8BE-LABEL: test_liwzx2: +; P8BE: # %bb.0: +; P8BE-NEXT: lfiwzx f0, 0, r3 +; P8BE-NEXT: lfiwzx f1, 0, r4 +; P8BE-NEXT: mr r3, r5 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE-NEXT: xxsldwi vs1, f1, f1, 1 +; P8BE-NEXT: xvsubsp vs0, vs0, vs1 +; P8BE-NEXT: xscvspdpn f0, vs0 +; P8BE-NEXT: stfsx f0, 0, r5 +; P8BE-NEXT: blr + %a = load <1 x float>, <1 x float>* %A + %b = load <1 x float>, <1 x float>* %B + %X = fsub <1 x float> %a, %b + store <1 x float> %X, <1 x float>* %C + ret <1 x float>* %C +} diff --git 
a/test/CodeGen/PowerPC/scalar_vector_test_3.ll b/test/CodeGen/PowerPC/scalar_vector_test_3.ll new file mode 100644 index 0000000000000..c63044a79a5aa --- /dev/null +++ b/test/CodeGen/PowerPC/scalar_vector_test_3.ll @@ -0,0 +1,265 @@ +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8BE + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test1(i32* nocapture readonly %int32, <2 x i64> %vec) { +; P9LE-LABEL: s2v_test1: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfiwax f0, 0, r3 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test1: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfiwax f0, 0, r3 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test1: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwax f0, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: xxpermdi v2, v2, v3, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test1: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfiwax f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %int32, align 4 + %conv = sext i32 %0 to i64 + %vecins = insertelement <2 x i64> %vec, i64 %conv, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test2(i32* nocapture readonly %int32, <2 x i64> %vec) { +; 
P9LE-LABEL: s2v_test2: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addi r3, r3, 4 +; P9LE-NEXT: lfiwax f0, 0, r3 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test2: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: addi r3, r3, 4 +; P9BE-NEXT: lfiwax f0, 0, r3 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test2: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: lfiwax f0, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: xxpermdi v2, v2, v3, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test2: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 4 +; P8BE-NEXT: lfiwax f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 + %0 = load i32, i32* %arrayidx, align 4 + %conv = sext i32 %0 to i64 + %vecins = insertelement <2 x i64> %vec, i64 %conv, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test3(i32* nocapture readonly %int32, <2 x i64> %vec, i32 signext %Idx) { +; P9LE-LABEL: s2v_test3: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: sldi r4, r7, 2 +; P9LE-NEXT: lfiwax f0, r3, r4 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test3: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: sldi r4, r7, 2 +; P9BE-NEXT: lfiwax f0, r3, r4 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test3: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: sldi r4, r7, 2 +; P8LE-NEXT: lfiwax f0, r3, r4 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: xxpermdi v2, v2, v3, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test3: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: sldi r4, r7, 2 +; P8BE-NEXT: lfiwax f0, r3, r4 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %idxprom = sext i32 %Idx to i64 + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 %idxprom + %0 = 
load i32, i32* %arrayidx, align 4 + %conv = sext i32 %0 to i64 + %vecins = insertelement <2 x i64> %vec, i64 %conv, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test4(i32* nocapture readonly %int32, <2 x i64> %vec) { +; P9LE-LABEL: s2v_test4: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addi r3, r3, 4 +; P9LE-NEXT: lfiwax f0, 0, r3 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test4: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: addi r3, r3, 4 +; P9BE-NEXT: lfiwax f0, 0, r3 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test4: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: lfiwax f0, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: xxpermdi v2, v2, v3, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test4: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 4 +; P8BE-NEXT: lfiwax f0, 0, r3 +; P8BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 + %0 = load i32, i32* %arrayidx, align 4 + %conv = sext i32 %0 to i64 + %vecins = insertelement <2 x i64> %vec, i64 %conv, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test5(<2 x i64> %vec, i32* nocapture readonly %ptr1) { +; P9LE-LABEL: s2v_test5: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfiwax f0, 0, r5 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: xxpermdi v2, v2, v3, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test5: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfiwax f0, 0, r5 +; P9BE-NEXT: xxpermdi v2, vs0, v2, 1 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test5: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwax f0, 0, r5 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: xxpermdi v2, v2, v3, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test5: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfiwax f0, 0, r5 +; P8BE-NEXT: xxpermdi v2, vs0, 
v2, 1 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %ptr1, align 4 + %conv = sext i32 %0 to i64 + %vecins = insertelement <2 x i64> %vec, i64 %conv, i32 0 + ret <2 x i64> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test6(i32* nocapture readonly %ptr) { +; P9LE-LABEL: s2v_test6: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfiwax f0, 0, r3 +; P9LE-NEXT: xxpermdi v2, f0, f0, 2 +; P9LE-NEXT: xxspltd v2, v2, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test6: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfiwax f0, 0, r3 +; P9BE-NEXT: xxspltd v2, vs0, 0 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test6: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwax f0, 0, r3 +; P8LE-NEXT: xxpermdi v2, f0, f0, 2 +; P8LE-NEXT: xxspltd v2, v2, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test6: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfiwax f0, 0, r3 +; P8BE-NEXT: xxspltd v2, vs0, 0 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %ptr, align 4 + %conv = sext i32 %0 to i64 + %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 + %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %splat.splat +} + +; Function Attrs: norecurse nounwind readonly +define <2 x i64> @s2v_test7(i32* nocapture readonly %ptr) { +; P9LE-LABEL: s2v_test7: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfiwax f0, 0, r3 +; P9LE-NEXT: xxpermdi v2, f0, f0, 2 +; P9LE-NEXT: xxspltd v2, v2, 1 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test7: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: lfiwax f0, 0, r3 +; P9BE-NEXT: xxspltd v2, vs0, 0 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test7: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwax f0, 0, r3 +; P8LE-NEXT: xxpermdi v2, f0, f0, 2 +; P8LE-NEXT: xxspltd v2, v2, 1 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test7: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfiwax f0, 0, r3 +; P8BE-NEXT: xxspltd v2, vs0, 0 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %ptr, align 4 + %conv = sext i32 %0 to i64 + 
%splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0 + %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %splat.splat +} + diff --git a/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/test/CodeGen/PowerPC/scalar_vector_test_4.ll new file mode 100644 index 0000000000000..aaaf0ba60f1db --- /dev/null +++ b/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -0,0 +1,341 @@ +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=P8BE + +; Function Attrs: norecurse nounwind readonly +define <4 x i32> @s2v_test1(i32* nocapture readonly %int32, <4 x i32> %vec) { +; P8LE-LABEL: s2v_test1: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; P8LE-NEXT: addi r3, r4, .LCPI0_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test1: +; P8BE: # %bb.0: # %entry +; P8BE: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %int32, align 4 + %vecins = insertelement <4 x i32> %vec, i32 %0, i32 0 + ret <4 x i32> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <4 x i32> @s2v_test2(i32* nocapture 
readonly %int32, <4 x i32> %vec) { +; P8LE-LABEL: s2v_test2: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: addi r3, r4, .LCPI1_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test2: +; P8BE: # %bb.0: # %entry +; P8BE: addi r3, r3, 4 +; P8BE: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 + %0 = load i32, i32* %arrayidx, align 4 + %vecins = insertelement <4 x i32> %vec, i32 %0, i32 0 + ret <4 x i32> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <4 x i32> @s2v_test3(i32* nocapture readonly %int32, <4 x i32> %vec, i32 signext %Idx) { +; P8LE-LABEL: s2v_test3: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: sldi r5, r7, 2 +; P8LE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; P8LE-NEXT: lfiwzx f0, r3, r5 +; P8LE-NEXT: addi r3, r4, .LCPI2_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test3: +; P8BE: # %bb.0: # %entry +; P8BE: sldi r4, r7, 2 +; P8BE: lfiwzx f0, r3, r4 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %idxprom = sext i32 %Idx to i64 + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %vecins = insertelement <4 x i32> %vec, i32 %0, i32 0 + ret <4 x i32> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <4 x i32> @s2v_test4(i32* nocapture readonly %int32, <4 x i32> %vec) { +; P8LE-LABEL: s2v_test4: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: addi 
r3, r4, .LCPI3_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test4: +; P8BE: # %bb.0: # %entry +; P8BE: addi r3, r3, 4 +; P8BE: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %int32, i64 1 + %0 = load i32, i32* %arrayidx, align 4 + %vecins = insertelement <4 x i32> %vec, i32 %0, i32 0 + ret <4 x i32> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <4 x i32> @s2v_test5(<4 x i32> %vec, i32* nocapture readonly %ptr1) { +; P8LE-LABEL: s2v_test5: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwzx f0, 0, r5 +; P8LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test5: +; P8BE: # %bb.0: # %entry +; P8BE: lfiwzx f0, 0, r5 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %0 = load i32, i32* %ptr1, align 4 + %vecins = insertelement <4 x i32> %vec, i32 %0, i32 0 + ret <4 x i32> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <4 x float> @s2v_test_f1(float* nocapture readonly %f64, <4 x float> %vec) { +; P8LE-LABEL: s2v_test_f1: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; P8LE-NEXT: addi r3, r4, .LCPI5_0@toc@l +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f1: +; P8BE: # %bb.0: # %entry +; P8BE: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi vs0, f0, f0, 1 +; P8BE: xxsldwi vs0, v2, vs0, 1 +; P8BE: xxsldwi v2, vs0, vs0, 3 +; P8BE-NEXT: blr +entry: + %0 = load float, float* %f64, align 4 + %vecins = 
insertelement <4 x float> %vec, float %0, i32 0 + ret <4 x float> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x float> @s2v_test_f2(float* nocapture readonly %f64, <2 x float> %vec) { +; P9LE-LABEL: s2v_test_f2: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addi r3, r3, 4 +; P9LE-NEXT: xxspltw v2, v2, 2 +; P9LE-NEXT: lfiwzx f0, 0, r3 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f2: +; P9BE: # %bb.0: # %entry +; P9BE: addi r3, r3, 4 +; P9BE: xxspltw v2, v2, 1 +; P9BE: lfiwzx f0, 0, r3 +; P9BE-NEXT: xxsldwi v3, f0, f0, 1 +; P9BE: vmrghw v2, v3, v2 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f2: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: xxspltw v2, v2, 2 +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f2: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 4 +; P8BE-NEXT: xxspltw v2, v2, 1 +; P8BE-NEXT: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi v3, f0, f0, 1 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds float, float* %f64, i64 1 + %0 = load float, float* %arrayidx, align 8 + %vecins = insertelement <2 x float> %vec, float %0, i32 0 + ret <2 x float> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x float> @s2v_test_f3(float* nocapture readonly %f64, <2 x float> %vec, i32 signext %Idx) { +; P9LE-LABEL: s2v_test_f3: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: sldi r4, r7, 2 +; P9LE-NEXT: xxspltw v2, v2, 2 +; P9LE-NEXT: lfiwzx f0, r3, r4 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f3: +; P9BE: # %bb.0: # %entry +; P9BE: sldi r4, r7, 2 +; P9BE: xxspltw v2, v2, 1 +; P9BE: lfiwzx f0, r3, r4 +; P9BE-NEXT: xxsldwi v3, f0, f0, 1 +; P9BE: vmrghw v2, v3, v2 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f3: +; P8LE: # %bb.0: # %entry +; 
P8LE-NEXT: sldi r4, r7, 2 +; P8LE-NEXT: xxspltw v2, v2, 2 +; P8LE-NEXT: lfiwzx f0, r3, r4 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f3: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: sldi r4, r7, 2 +; P8BE-NEXT: xxspltw v2, v2, 1 +; P8BE-NEXT: lfiwzx f0, r3, r4 +; P8BE-NEXT: xxsldwi v3, f0, f0, 1 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr +entry: + %idxprom = sext i32 %Idx to i64 + %arrayidx = getelementptr inbounds float, float* %f64, i64 %idxprom + %0 = load float, float* %arrayidx, align 8 + %vecins = insertelement <2 x float> %vec, float %0, i32 0 + ret <2 x float> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x float> @s2v_test_f4(float* nocapture readonly %f64, <2 x float> %vec) { +; P9LE-LABEL: s2v_test_f4: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addi r3, r3, 4 +; P9LE-NEXT: xxspltw v2, v2, 2 +; P9LE-NEXT: lfiwzx f0, 0, r3 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f4: +; P9BE: # %bb.0: # %entry +; P9BE: addi r3, r3, 4 +; P9BE: xxspltw v2, v2, 1 +; P9BE: lfiwzx f0, 0, r3 +; P9BE-NEXT: xxsldwi v3, f0, f0, 1 +; P9BE: vmrghw v2, v3, v2 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f4: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addi r3, r3, 4 +; P8LE-NEXT: xxspltw v2, v2, 2 +; P8LE-NEXT: lfiwzx f0, 0, r3 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f4: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: addi r3, r3, 4 +; P8BE-NEXT: xxspltw v2, v2, 1 +; P8BE-NEXT: lfiwzx f0, 0, r3 +; P8BE-NEXT: xxsldwi v3, f0, f0, 1 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr +entry: + %arrayidx = getelementptr inbounds float, float* %f64, i64 1 + %0 = load float, float* %arrayidx, align 8 + %vecins = insertelement <2 x float> %vec, float %0, i32 0 + ret <2 x float> %vecins +} + +; Function Attrs: norecurse nounwind readonly +define <2 x float> 
@s2v_test_f5(<2 x float> %vec, float* nocapture readonly %ptr1) { +; P9LE-LABEL: s2v_test_f5: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: lfiwzx f0, 0, r5 +; P9LE-NEXT: xxspltw v2, v2, 2 +; P9LE-NEXT: xxpermdi v3, f0, f0, 2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr + +; P9BE-LABEL: s2v_test_f5: +; P9BE: # %bb.0: # %entry +; P9BE: lfiwzx f0, 0, r5 +; P9BE: xxspltw v2, v2, 1 +; P9BE-NEXT: xxsldwi v3, f0, f0, 1 +; P9BE: vmrghw v2, v3, v2 +; P9BE-NEXT: blr + +; P8LE-LABEL: s2v_test_f5: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: lfiwzx f0, 0, r5 +; P8LE-NEXT: xxspltw v2, v2, 2 +; P8LE-NEXT: xxpermdi v3, f0, f0, 2 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr + +; P8BE-LABEL: s2v_test_f5: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: lfiwzx f0, 0, r5 +; P8BE-NEXT: xxspltw v2, v2, 1 +; P8BE-NEXT: xxsldwi v3, f0, f0, 1 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr +entry: + %0 = load float, float* %ptr1, align 8 + %vecins = insertelement <2 x float> %vec, float %0, i32 0 + ret <2 x float> %vecins +} + diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index 82c240e46d264..ac0bcc740681d 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -1,12 +1,15 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 \ -; RUN: -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -O3 < %s | FileCheck %s ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-P9 \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: < %s | FileCheck %s --check-prefix=CHECK-P9 \ ; RUN: --implicit-check-not xxswapd ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 \ -; RUN: -verify-machineinstrs -mattr=-power9-vector < %s | FileCheck %s +; RUN: 
-ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mattr=-power9-vector < %s | FileCheck %s ; These tests verify that VSX swap optimization works when loading a scalar ; into a vector register. @@ -17,6 +20,31 @@ @y = global double 1.780000e+00, align 8 define void @bar0() { +; CHECK-LABEL: bar0: +; CHECK: # %bb.0: # %entry +; CHECK: addis r3, r2, .LC0@toc@ha +; CHECK: addis r4, r2, .LC1@toc@ha +; CHECK: ld r3, .LC0@toc@l(r3) +; CHECK: addis r3, r2, .LC2@toc@ha +; CHECK: ld r3, .LC2@toc@l(r3) +; CHECK: xxpermdi vs0, vs0, vs1, 1 +; CHECK: stxvd2x vs0, 0, r3 +; CHECK: blr +; +; CHECK-P9-LABEL: bar0: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: addis r3, r2, .LC0@toc@ha +; CHECK-P9: addis r4, r2, .LC1@toc@ha +; CHECK-P9: ld r3, .LC0@toc@l(r3) +; CHECK-P9: ld r4, .LC1@toc@l(r4) +; CHECK-P9: lfd f0, 0(r3) +; CHECK-P9: lxvx vs1, 0, r4 +; CHECK-P9: addis r3, r2, .LC2@toc@ha +; CHECK-P9: ld r3, .LC2@toc@l(r3) +; CHECK-P9: xxpermdi vs0, f0, f0, 2 +; CHECK-P9: xxpermdi vs0, vs1, vs0, 1 +; CHECK-P9: stxvx vs0, 0, r3 +; CHECK-P9: blr entry: %0 = load <2 x double>, <2 x double>* @x, align 16 %1 = load double, double* @y, align 8 @@ -25,21 +53,32 @@ entry: ret void } -; CHECK-LABEL: @bar0 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: lfdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1 -; CHECK: stxvd2x [[REG5]] - -; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] -; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) -; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxvx [[REG5]] - define void @bar1() { +; CHECK-LABEL: bar1: +; CHECK: # %bb.0: # %entry +; CHECK: addis r3, r2, .LC0@toc@ha +; CHECK: addis r4, r2, .LC1@toc@ha +; CHECK: ld r3, .LC0@toc@l(r3) +; CHECK: addis r3, r2, .LC2@toc@ha +; CHECK: ld r3, .LC2@toc@l(r3) +; CHECK: xxmrghd vs0, vs1, vs0 +; CHECK: stxvd2x vs0, 0, r3 +; CHECK: blr +; +; 
CHECK-P9-LABEL: bar1: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9: addis r3, r2, .LC0@toc@ha +; CHECK-P9: addis r4, r2, .LC1@toc@ha +; CHECK-P9: ld r3, .LC0@toc@l(r3) +; CHECK-P9: ld r4, .LC1@toc@l(r4) +; CHECK-P9: lfd f0, 0(r3) +; CHECK-P9: lxvx vs1, 0, r4 +; CHECK-P9: addis r3, r2, .LC2@toc@ha +; CHECK-P9: ld r3, .LC2@toc@l(r3) +; CHECK-P9: xxpermdi vs0, f0, f0, 2 +; CHECK-P9: xxmrgld vs0, vs0, vs1 +; CHECK-P9: stxvx vs0, 0, r3 +; CHECK-P9: blr entry: %0 = load <2 x double>, <2 x double>* @x, align 16 %1 = load double, double* @y, align 8 @@ -48,17 +87,3 @@ entry: ret void } -; CHECK-LABEL: @bar1 -; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] -; CHECK-DAG: lfdx [[REG2:[0-9]+]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]] -; CHECK: stxvd2x [[REG5]] - -; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] -; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) -; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 -; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxvx [[REG5]] - diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll index df4cf357e2e0d..ef7d8f3500756 100644 --- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -1,74 +1,125 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mattr=+vsx \ -; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mattr=+vsx -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ +; RUN: | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mattr=-power9-vector \ -; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mattr=-power9-vector -ppc-vsr-nums-as-vr \ +; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ +; RUN: | FileCheck --check-prefix=CHECK-P9-VECTOR %s -; RUN: llc 
-verify-machineinstrs -mcpu=pwr9 \ +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s \ ; RUN: --check-prefix=CHECK-P9 --implicit-check-not xxswapd define <2 x double> @testi0(<2 x double>* %p1, double* %p2) { +; CHECK-LABEL: testi0: +; CHECK: # %bb.0: +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: lfdx f1, 0, r4 +; CHECK-NEXT: xxswapd vs0, vs0 +; CHECK-NEXT: xxspltd vs1, vs1, 0 +; CHECK-NEXT: xxpermdi v2, vs0, vs1, 1 +; CHECK-NEXT: blr +; +; CHECK-P9-VECTOR-LABEL: testi0: +; CHECK-P9-VECTOR: # %bb.0: +; CHECK-P9-VECTOR-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P9-VECTOR-NEXT: lfdx f1, 0, r4 +; CHECK-P9-VECTOR-NEXT: xxspltd vs1, vs1, 0 +; CHECK-P9-VECTOR-NEXT: xxswapd vs0, vs0 +; CHECK-P9-VECTOR-NEXT: xxpermdi v2, vs0, vs1, 1 +; CHECK-P9-VECTOR-NEXT: blr +; +; CHECK-P9-LABEL: testi0: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: lfd f0, 0(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-P9-NEXT: xxpermdi v2, vs1, vs0, 1 +; CHECK-P9-NEXT: blr %v = load <2 x double>, <2 x double>* %p1 %s = load double, double* %p2 %r = insertelement <2 x double> %v, double %s, i32 0 ret <2 x double> %r -; CHECK-LABEL: testi0 -; CHECK: lxvd2x 0, 0, 3 -; CHECK: lfdx 1, 0, 4 -; CHECK-DAG: xxspltd 1, 1, 0 -; CHECK-DAG: xxswapd 0, 0 -; CHECK: xxpermdi 34, 0, 1, 1 -; CHECK-P9-LABEL: testi0 -; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) -; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 -; CHECK-P9: xxpermdi 34, [[REG2]], [[REG3]], 1 } define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { +; CHECK-LABEL: testi1: +; CHECK: # %bb.0: +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: lfdx f1, 0, r4 +; CHECK-NEXT: xxswapd vs0, vs0 +; CHECK-NEXT: xxspltd vs1, vs1, 0 +; CHECK-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-NEXT: blr +; +; CHECK-P9-VECTOR-LABEL: testi1: +; CHECK-P9-VECTOR: # %bb.0: +; CHECK-P9-VECTOR-NEXT: lxvd2x vs0, 0, r3 +; 
CHECK-P9-VECTOR-NEXT: lfdx f1, 0, r4 +; CHECK-P9-VECTOR-NEXT: xxspltd vs1, vs1, 0 +; CHECK-P9-VECTOR-NEXT: xxswapd vs0, vs0 +; CHECK-P9-VECTOR-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-P9-VECTOR-NEXT: blr +; +; CHECK-P9-LABEL: testi1: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: lfd f0, 0(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: xxpermdi vs0, f0, f0, 2 +; CHECK-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-P9-NEXT: blr %v = load <2 x double>, <2 x double>* %p1 %s = load double, double* %p2 %r = insertelement <2 x double> %v, double %s, i32 1 ret <2 x double> %r -; CHECK-LABEL: testi1 -; CHECK: lxvd2x 0, 0, 3 -; CHECK: lfdx 1, 0, 4 -; CHECK-DAG: xxspltd 1, 1, 0 -; CHECK-DAG: xxswapd 0, 0 -; CHECK: xxmrgld 34, 1, 0 -; CHECK-P9-LABEL: testi1 -; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) -; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 -; CHECK-P9: xxmrgld 34, [[REG3]], [[REG2]] } define double @teste0(<2 x double>* %p1) { +; CHECK-LABEL: teste0: +; CHECK: # %bb.0: +; CHECK-NEXT: lxvd2x vs1, 0, r3 +; CHECK: blr +; +; CHECK-P9-VECTOR-LABEL: teste0: +; CHECK-P9-VECTOR: # %bb.0: +; CHECK-P9-VECTOR-NEXT: lxvd2x vs1, 0, r3 +; CHECK-P9-VECTOR: blr +; +; CHECK-P9-LABEL: teste0: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: lfd f1, 0(r3) +; CHECK-P9-NEXT: blr %v = load <2 x double>, <2 x double>* %p1 %r = extractelement <2 x double> %v, i32 0 ret double %r -; CHECK-LABEL: teste0 -; CHECK: lxvd2x 1, 0, 3 -; CHECK-P9-LABEL: teste0 -; CHECK-P9: lfd 1, 0(3) } define double @teste1(<2 x double>* %p1) { +; CHECK-LABEL: teste1: +; CHECK: # %bb.0: +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: xxswapd vs1, vs0 +; CHECK: blr +; +; CHECK-P9-VECTOR-LABEL: teste1: +; CHECK-P9-VECTOR: # %bb.0: +; CHECK-P9-VECTOR-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P9-VECTOR-NEXT: xxswapd vs1, vs0 +; CHECK-P9-VECTOR: blr +; +; CHECK-P9-LABEL: teste1: +; CHECK-P9: # %bb.0: +; CHECK-P9-NEXT: lfd f1, 8(r3) +; CHECK-P9-NEXT: blr %v = load <2 x double>, <2 x double>* %p1 %r = extractelement 
<2 x double> %v, i32 1 ret double %r -; CHECK-LABEL: teste1 -; CHECK: lxvd2x 0, 0, 3 -; CHECK: xxswapd 1, 0 -; CHECK-P9-LABEL: teste1 -; CHECK-P9: lfd 1, 8(3) } From 08f7943cf05b989e05297fea48e2464f5af11bf1 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 30 Nov 2018 17:55:00 +0000 Subject: [PATCH 14/21] Merging r344589: ------------------------------------------------------------------------ r344589 | dstenb | 2018-10-16 01:06:48 -0700 (Tue, 16 Oct 2018) | 41 lines [DebugInfo][LCSSA] Rewrite pre-existing debug values outside loop Summary: Extend LCSSA so that debug values outside loops are rewritten to use the PHI nodes that the pass creates. This fixes PR39019. In that case, we ran LCSSA on a loop that was later on vectorized, which left us with something like this: for.cond.cleanup: %add.lcssa = phi i32 [ %add, %for.body ], [ %34, %middle.block ] call void @llvm.dbg.value(metadata i32 %add, ret i32 %add.lcssa for.body: %add = [...] br i1 %exitcond, label %for.cond.cleanup, label %for.body which later resulted in the debug.value becoming undef when removing the scalar loop (and the location would have probably been wrong for the vectorized case otherwise). As we now may need to query the AvailableVals cache more than once for a basic block, FindAvailableVals() in SSAUpdaterImpl is changed so that it updates the cache for blocks that we do not create a PHI node for, regardless of the block's number of predecessors. The debug value in the attached IR reproducer would not be properly rewritten without this. Debug values residing in blocks where we have not inserted any PHI nodes are currently left as-is by this patch. I'm not sure what should be done with those uses. 
Reviewers: mattd, aprantl, vsk, probinson Reviewed By: mattd, aprantl Subscribers: jmorse, gbedwell, JDevlieghere, llvm-commits Differential Revision: https://reviews.llvm.org/D53130 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348011 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Utils/SSAUpdater.h | 4 ++ .../llvm/Transforms/Utils/SSAUpdaterImpl.h | 7 +- lib/Transforms/Utils/LCSSA.cpp | 16 +++++ lib/Transforms/Utils/SSAUpdater.cpp | 5 ++ .../LCSSA/rewrite-existing-dbg-values.ll | 69 +++++++++++++++++++ 5 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 test/Transforms/LCSSA/rewrite-existing-dbg-values.ll diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h index 4a79116629902..d02607acbbb57 100644 --- a/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/include/llvm/Transforms/Utils/SSAUpdater.h @@ -76,6 +76,10 @@ class SSAUpdater { /// block. bool HasValueForBlock(BasicBlock *BB) const; + /// Return the value for the specified block if the SSAUpdater has one, + /// otherwise return nullptr. + Value *FindValueForBlock(BasicBlock *BB) const; + /// Construct SSA form, materializing a value that is live at the end /// of the specified block. Value *GetValueAtEndOfBlock(BasicBlock *BB); diff --git a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index b7649ba883344..cab0f3e715757 100644 --- a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -357,10 +357,9 @@ class SSAUpdaterImpl { BBInfo *Info = *I; if (Info->DefBB != Info) { - // Record the available value at join nodes to speed up subsequent - // uses of this SSAUpdater for the same value. 
- if (Info->NumPreds > 1) - (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal; + // Record the available value to speed up subsequent uses of this + // SSAUpdater for the same value. + (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal; continue; } diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index a1f8e7484bcf9..53d444b309d5d 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" @@ -201,6 +202,21 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, SSAUpdate.RewriteUse(*UseToRewrite); } + SmallVector DbgValues; + llvm::findDbgValues(DbgValues, I); + + // Update pre-existing debug value uses that reside outside the loop. + auto &Ctx = I->getContext(); + for (auto DVI : DbgValues) { + BasicBlock *UserBB = DVI->getParent(); + if (InstBB == UserBB || L->contains(UserBB)) + continue; + // We currently only handle debug values residing in blocks where we have + // inserted a PHI instruction. + if (Value *V = SSAUpdate.FindValueForBlock(UserBB)) + DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V))); + } + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need // to post-process them to keep LCSSA form. 
for (PHINode *InsertedPN : InsertedPHIs) { diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 4a1fd8d571aa1..9e5fb0e7172d4 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -64,6 +64,11 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { return getAvailableVals(AV).count(BB); } +Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const { + AvailableValsTy::iterator AVI = getAvailableVals(AV).find(BB); + return (AVI != getAvailableVals(AV).end()) ? AVI->second : nullptr; +} + void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { assert(ProtoType && "Need to initialize SSAUpdater"); assert(ProtoType == V->getType() && diff --git a/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll new file mode 100644 index 0000000000000..231e716cb6d6d --- /dev/null +++ b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -lcssa < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Reproducer for PR39019. +; +; Verify that the llvm.dbg.value in the %for.cond.cleanup2 block is rewritten +; to use the PHI node for %add that is created by LCSSA. 
+ +; CHECK-LABEL: for.cond.cleanup2: +; CHECK-NEXT: [[PN:%[^ ]*]] = phi i32 [ %add.lcssa, %for.cond.cleanup1 ] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR:![0-9]+]], metadata !DIExpression()) +; CHECK-NEXT: call void @bar(i32 [[PN]]) + +; CHECK-LABEL: for.body: +; CHECK: %add = add nsw i32 0, 2 +; CHECK: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR]], metadata !DIExpression()) + +; CHECK: [[VAR]] = !DILocalVariable(name: "sum", + +; Function Attrs: nounwind +define void @foo() #0 !dbg !6 { +entry: + br label %for.cond.preheader, !dbg !12 + +for.cond.preheader: ; preds = %for.cond.cleanup1, %entry + br label %for.body, !dbg !12 + +for.cond.cleanup2: ; preds = %for.cond.cleanup1 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12 + tail call void @bar(i32 %add) #0, !dbg !12 + ret void, !dbg !12 + +for.cond.cleanup1: ; preds = %for.body + br i1 false, label %for.cond.preheader, label %for.cond.cleanup2, !dbg !12 + +for.body: ; preds = %for.body, %for.cond.preheader + %add = add nsw i32 0, 2, !dbg !12 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12 + br i1 false, label %for.body, label %for.cond.cleanup1, !dbg !12 +} + +; Function Attrs: nounwind +declare void @bar(i32) #0 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2) +!1 = !DIFile(filename: "foo.c", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 8.0.0"} +!6 = distinct !DISubprogram(name: 
"foo", scope: !1, file: !1, line: 10, type: !7, isLocal: false, isDefinition: true, scopeLine: 10, isOptimized: true, unit: !0, retainedNodes: !8) +!7 = !DISubroutineType(types: !2) +!8 = !{!9} +!9 = !DILocalVariable(name: "sum", scope: !10, file: !1, line: 11, type: !11) +!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !DILocation(line: 0, scope: !10) From 3315ebed51e7e5e3fc01578455242e9d5f4fdb11 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 6 Dec 2018 03:14:29 +0000 Subject: [PATCH 15/21] Merging r343369: ------------------------------------------------------------------------ r343369 | vitalybuka | 2018-09-28 19:17:12 -0700 (Fri, 28 Sep 2018) | 1 line [cxx2a] Fix warning triggered by r343285 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348450 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/DebugInfo/PDB/Native/GlobalsStream.h | 2 -- include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h | 2 +- include/llvm/ExecutionEngine/Orc/Core.h | 2 +- include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h | 3 +-- include/llvm/ProfileData/Coverage/CoverageMapping.h | 2 -- lib/Analysis/MemorySSA.cpp | 1 - lib/Target/AMDGPU/AMDGPULibFunc.cpp | 1 - lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 6 +++--- 8 files changed, 6 insertions(+), 13 deletions(-) diff --git a/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h b/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h index fdc58dc60f7e0..dd04b5c5681d1 100644 --- a/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h +++ b/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h @@ -30,8 +30,6 @@ class GSIHashIterator GSIHashIterator, FixedStreamArrayIterator, std::random_access_iterator_tag, const uint32_t> { public: - GSIHashIterator() = default; - template GSIHashIterator(T &&v) : 
GSIHashIterator::iterator_adaptor_base(std::forward(v)) {} diff --git a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h index 6602264d1b747..efc25e0559b9d 100644 --- a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h +++ b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h @@ -49,7 +49,7 @@ class ModuleDebugStreamRef { BinarySubstreamRef getC13LinesSubstream() const; BinarySubstreamRef getGlobalRefsSubstream() const; - ModuleDebugStreamRef &operator=(ModuleDebugStreamRef &&Other) = default; + ModuleDebugStreamRef &operator=(ModuleDebugStreamRef &&Other) = delete; iterator_range subsections() const; codeview::DebugSubsectionArray getSubsectionsArray() const { diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h index fd03687cfc21f..11d7c091947e8 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -126,7 +126,7 @@ class MaterializationResponsibility { public: MaterializationResponsibility(MaterializationResponsibility &&) = default; MaterializationResponsibility & - operator=(MaterializationResponsibility &&) = default; + operator=(MaterializationResponsibility &&) = delete; /// Destruct a MaterializationResponsibility instance. 
In debug mode /// this asserts that all symbols being tracked have been either diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h index 739e5ba47c121..45f95f63e70f8 100644 --- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h +++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h @@ -70,8 +70,7 @@ class OrcRemoteTargetClient RemoteRTDyldMemoryManager & operator=(const RemoteRTDyldMemoryManager &) = delete; RemoteRTDyldMemoryManager(RemoteRTDyldMemoryManager &&) = default; - RemoteRTDyldMemoryManager & - operator=(RemoteRTDyldMemoryManager &&) = default; + RemoteRTDyldMemoryManager &operator=(RemoteRTDyldMemoryManager &&) = delete; uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, diff --git a/include/llvm/ProfileData/Coverage/CoverageMapping.h b/include/llvm/ProfileData/Coverage/CoverageMapping.h index ecb284d30de01..e820f71cb6d55 100644 --- a/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -641,8 +641,6 @@ class LineCoverageIterator this->operator++(); } - LineCoverageIterator &operator=(const LineCoverageIterator &R) = default; - bool operator==(const LineCoverageIterator &R) const { return &CD == &R.CD && Next == R.Next && Ended == R.Ended; } diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index b38c0c4f14392..6e49a39926a2f 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -119,7 +119,6 @@ class MemoryLocOrCall { public: bool IsCall = false; - MemoryLocOrCall() = default; MemoryLocOrCall(MemoryUseOrDef *MUD) : MemoryLocOrCall(MUD->getMemoryInst()) {} MemoryLocOrCall(const MemoryUseOrDef *MUD) diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 4671273d61f91..f37795e961e80 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -90,7 +90,6 @@ class 
UnmangledFuncInfo { public: using ID = AMDGPULibFunc::EFuncId; - UnmangledFuncInfo() = default; UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) : Name(_Name), NumArgs(_NumArgs) {} // Get index to Table by function name. diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index bb0e4379d1a8f..f03fcc9c4e2c7 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -231,17 +231,17 @@ struct TransformedFunction { TransformedFunction& operator=(TransformedFunction&&) = default; /// Type of the function before the transformation. - FunctionType* const OriginalType; + FunctionType *OriginalType; /// Type of the function after the transformation. - FunctionType* const TransformedType; + FunctionType *TransformedType; /// Transforming a function may change the position of arguments. This /// member records the mapping from each argument's old position to its new /// position. Argument positions are zero-indexed. If the transformation /// from F to F' made the first argument of F into the third argument of F', /// then ArgumentIndexMapping[0] will equal 2. - const std::vector ArgumentIndexMapping; + std::vector ArgumentIndexMapping; }; /// Given function attributes from a call site for the original function, From a3ee448a5f62038da9424572344068f2bbcb3e1e Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 6 Dec 2018 21:36:42 +0000 Subject: [PATCH 16/21] Merging r348181: ------------------------------------------------------------------------ r348181 | lebedevri | 2018-12-03 12:07:58 -0800 (Mon, 03 Dec 2018) | 8 lines [InstCombine] foldICmpWithLowBitMaskedVal(): disable 2 faulty folds. 
These two folds are invalid for this non-constant pattern when the mask ends up being all-ones: https://rise4fun.com/Alive/9au https://rise4fun.com/Alive/UcQM Fixes https://bugs.llvm.org/show_bug.cgi?id=39861 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348528 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/InstCombine/InstCombineCompares.cpp | 4 ++++ ...e-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll | 10 ++++++---- ...e-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll | 10 ++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6de92a4842ab3..73ade5d075159 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2924,12 +2924,16 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, // x & (-1 >> y) s>= x -> x s<= (-1 >> y) if (X != I.getOperand(1)) // X must be on RHS of comparison! return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; DstPred = ICmpInst::Predicate::ICMP_SLE; break; case ICmpInst::Predicate::ICMP_SLT: // x & (-1 >> y) s< x -> x s> (-1 >> y) if (X != I.getOperand(1)) // X must be on RHS of comparison! return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. 
+ return nullptr; DstPred = ICmpInst::Predicate::ICMP_SGT; break; case ICmpInst::Predicate::ICMP_SLE: diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll index 7be784a452fd1..502b015b12235 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll @@ -26,8 +26,9 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp sge i8 [[TMP0]], [[X:%.*]] -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y %tmp1 = and i8 %tmp0, %x @@ -120,8 +121,9 @@ define i1 @cv0(i8 %y) { ; CHECK-LABEL: @cv0( ; CHECK-NEXT: [[X:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp sle i8 [[X]], [[TMP0]] -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], [[TMP0]] +; CHECK-NEXT: [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] ; %x = call i8 @gen8() %tmp0 = lshr i8 -1, %y diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll index d1792d1e075b8..7daa043e801fe 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll @@ -26,8 +26,9 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] 
= icmp slt i8 [[TMP0]], [[X:%.*]] -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y %tmp1 = and i8 %tmp0, %x @@ -120,8 +121,9 @@ define i1 @cv0(i8 %y) { ; CHECK-LABEL: @cv0( ; CHECK-NEXT: [[X:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i8 [[X]], [[TMP0]] -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], [[TMP0]] +; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] ; %x = call i8 @gen8() %tmp0 = lshr i8 -1, %y From fa099fd85c19d67353a4167fe57e98853e572fae Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 6 Dec 2018 22:20:44 +0000 Subject: [PATCH 17/21] Merging r348461: ------------------------------------------------------------------------ r348461 | lebedevri | 2018-12-06 00:11:20 -0800 (Thu, 06 Dec 2018) | 4 lines [NFC][InstCombine] Add more miscompile tests for foldICmpWithLowBitMaskedVal() We also have to me aware of vector constants. If at least one element is -1, we can't transform. 
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348535 91177308-0d34-0410-b5e6-96231b3b80d8 --- ...t-low-bit-mask-and-icmp-sge-to-icmp-sle.ll | 50 ++++++++++++++----- ...t-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll | 50 ++++++++++++++----- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll index 502b015b12235..36994b3f86b05 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll @@ -23,19 +23,6 @@ define i1 @p0(i8 %x) { ret i1 %ret } -define i1 @pv(i8 %x, i8 %y) { -; CHECK-LABEL: @pv( -; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]] -; CHECK-NEXT: ret i1 [[RET]] -; - %tmp0 = lshr i8 -1, %y - %tmp1 = and i8 %tmp0, %x - %ret = icmp sge i8 %tmp1, %x - ret i1 %ret -} - ; ============================================================================ ; ; Vector tests ; ============================================================================ ; @@ -198,3 +185,40 @@ define <2 x i1> @n2(<2 x i8> %x) { %ret = icmp sge <2 x i8> %tmp0, %x ret <2 x i1> %ret } + +; ============================================================================ ; +; Potential miscompiles. 
+; ============================================================================ ; + +define i1 @nv(i8 %x, i8 %y) { +; CHECK-LABEL: @nv( +; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp sge i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] +; + %tmp0 = lshr i8 -1, %y + %tmp1 = and i8 %tmp0, %x + %ret = icmp sge i8 %tmp1, %x + ret i1 %ret +} + +define <2 x i1> @n3_vec(<2 x i8> %x) { +; CHECK-LABEL: @n3_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i1> [[TMP1]] +; + %tmp0 = and <2 x i8> %x, + %ret = icmp sge <2 x i8> %tmp0, %x + ret <2 x i1> %ret +} + +define <3 x i1> @n4_vec(<3 x i8> %x) { +; CHECK-LABEL: @n4_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i8> [[X:%.*]], +; CHECK-NEXT: ret <3 x i1> [[TMP1]] +; + %tmp0 = and <3 x i8> %x, + %ret = icmp sge <3 x i8> %tmp0, %x + ret <3 x i1> %ret +} diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll index 7daa043e801fe..281f9e8f34bc2 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll @@ -23,19 +23,6 @@ define i1 @p0(i8 %x) { ret i1 %ret } -define i1 @pv(i8 %x, i8 %y) { -; CHECK-LABEL: @pv( -; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]] -; CHECK-NEXT: ret i1 [[RET]] -; - %tmp0 = lshr i8 -1, %y - %tmp1 = and i8 %tmp0, %x - %ret = icmp slt i8 %tmp1, %x - ret i1 %ret -} - ; ============================================================================ ; ; Vector tests ; ============================================================================ ; @@ -198,3 +185,40 @@ define <2 x i1> 
@n2(<2 x i8> %x) { %ret = icmp slt <2 x i8> %tmp0, %x ret <2 x i1> %ret } + +; ============================================================================ ; +; Potential miscompiles. +; ============================================================================ ; + +define i1 @nv(i8 %x, i8 %y) { +; CHECK-LABEL: @nv( +; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[TMP1]], [[X]] +; CHECK-NEXT: ret i1 [[RET]] +; + %tmp0 = lshr i8 -1, %y + %tmp1 = and i8 %tmp0, %x + %ret = icmp slt i8 %tmp1, %x + ret i1 %ret +} + +define <2 x i1> @n3(<2 x i8> %x) { +; CHECK-LABEL: @n3( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i1> [[TMP1]] +; + %tmp0 = and <2 x i8> %x, + %ret = icmp slt <2 x i8> %tmp0, %x + ret <2 x i1> %ret +} + +define <3 x i1> @n4(<3 x i8> %x) { +; CHECK-LABEL: @n4( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <3 x i8> [[X:%.*]], +; CHECK-NEXT: ret <3 x i1> [[TMP1]] +; + %tmp0 = and <3 x i8> %x, + %ret = icmp slt <3 x i8> %tmp0, %x + ret <3 x i1> %ret +} From cc39aee52886ad79b637af4311b1f3fe5e34953e Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 6 Dec 2018 22:36:26 +0000 Subject: [PATCH 18/21] Merging r348462: ------------------------------------------------------------------------ r348462 | lebedevri | 2018-12-06 00:14:24 -0800 (Thu, 06 Dec 2018) | 13 lines [InstCombine] foldICmpWithLowBitMaskedVal(): don't miscompile -1 vector elts I was finally able to quantify what i thought was missing in the fix, it was vector constants. If we have a scalar (and %x, -1), it will be instsimplified before we reach this code, but if it is a vector, we may still have a -1 element. Thus, we want to avoid the fold if *at least one* element is -1. Or in other words, ignoring the undef elements, no sign bits should be set. Thus, m_NonNegative(). 
A follow-up for rL348181 https://bugs.llvm.org/show_bug.cgi?id=39861 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348538 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/InstCombine/InstCombineCompares.cpp | 4 ++++ ...e-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll | 10 ++++++---- ...e-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll | 10 ++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 73ade5d075159..e1bae11b40d10 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2926,6 +2926,8 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, return nullptr; // Ignore the other case. if (!match(M, m_Constant())) // Can not do this fold with non-constant. return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; DstPred = ICmpInst::Predicate::ICMP_SLE; break; case ICmpInst::Predicate::ICMP_SLT: @@ -2934,6 +2936,8 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, return nullptr; // Ignore the other case. if (!match(M, m_Constant())) // Can not do this fold with non-constant. return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. 
+ return nullptr; DstPred = ICmpInst::Predicate::ICMP_SGT; break; case ICmpInst::Predicate::ICMP_SLE: diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll index 36994b3f86b05..ca1b86c0623ab 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll @@ -205,8 +205,9 @@ define i1 @nv(i8 %x, i8 %y) { define <2 x i1> @n3_vec(<2 x i8> %x) { ; CHECK-LABEL: @n3_vec( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i8> [[X:%.*]], -; CHECK-NEXT: ret <2 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp sge <2 x i8> [[TMP0]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[RET]] ; %tmp0 = and <2 x i8> %x, %ret = icmp sge <2 x i8> %tmp0, %x @@ -215,8 +216,9 @@ define <2 x i1> @n3_vec(<2 x i8> %x) { define <3 x i1> @n4_vec(<3 x i8> %x) { ; CHECK-LABEL: @n4_vec( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i8> [[X:%.*]], -; CHECK-NEXT: ret <3 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp sge <3 x i8> [[TMP0]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[RET]] ; %tmp0 = and <3 x i8> %x, %ret = icmp sge <3 x i8> %tmp0, %x diff --git a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll index 281f9e8f34bc2..2957ad5731c7d 100644 --- a/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll +++ b/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll @@ -205,8 +205,9 @@ define i1 @nv(i8 %x, i8 %y) { define <2 x i1> @n3(<2 x i8> %x) { ; CHECK-LABEL: @n3( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i8> [[X:%.*]], -; 
CHECK-NEXT: ret <2 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp slt <2 x i8> [[TMP0]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[RET]] ; %tmp0 = and <2 x i8> %x, %ret = icmp slt <2 x i8> %tmp0, %x @@ -215,8 +216,9 @@ define <2 x i1> @n3(<2 x i8> %x) { define <3 x i1> @n4(<3 x i8> %x) { ; CHECK-LABEL: @n4( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <3 x i8> [[X:%.*]], -; CHECK-NEXT: ret <3 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[TMP0]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[RET]] ; %tmp0 = and <3 x i8> %x, %ret = icmp slt <3 x i8> %tmp0, %x From 06a6c3307840bdc677efb19ce8a87e1ead4c2c5d Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 7 Dec 2018 00:24:01 +0000 Subject: [PATCH 19/21] Merging r340125: ------------------------------------------------------------------------ r340125 | lhames | 2018-08-18 11:38:37 -0700 (Sat, 18 Aug 2018) | 6 lines [RuntimeDyld] Fix a bug in RuntimeDyld::loadObjectImpl that was over-allocating space for common symbols. Patch by Dmitry Sidorov. Thanks Dmitry! 
Differential revision: https://reviews.llvm.org/D50240 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348555 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 1189be599eddc..76f5e5ead5046 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -275,7 +275,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { uint64_t Size = I->getCommonSize(); if (!CommonAlign) CommonAlign = Align; - CommonSize += alignTo(CommonSize, Align) + Size; + CommonSize = alignTo(CommonSize, Align) + Size; CommonSymbolsToAllocate.push_back(*I); } } else From 4a684f7170a0907f83398cf0cb21a24ce7eda0c9 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 7 Dec 2018 20:21:24 +0000 Subject: [PATCH 20/21] Merging r346203: ------------------------------------------------------------------------ r346203 | matze | 2018-11-05 19:15:22 -0800 (Mon, 05 Nov 2018) | 7 lines AArch64: Cleanup CCMP code; NFC Cleanup CCMP pattern matching code in preparation for review/bugfix: - Rename `isConjunctionDisjunctionTree()` to `canEmitConjunction()` (it won't accept arbitrary disjunctions and is really about whether we can transform the subtree into a conjunction that we can emit). 
- Rename `emitConjunctionDisjunctionTree()` to `emitConjunction()` ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348636 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 59 +++++++++++----------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index de762a7bb1d4d..dcc5957494b65 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1515,7 +1515,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of /// a comparison. They set the NZCV flags to a predefined value if their /// predicate is false. This allows to express arbitrary conjunctions, for -/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" /// expressed as: /// cmp A /// ccmp B, inv(CB), CA @@ -1585,14 +1585,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } -/// Returns true if @p Val is a tree of AND/OR/SETCC operations. -/// CanPushNegate is set to true if we can push a negate operation through -/// the tree in a was that we are left with AND operations and negate operations -/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to -/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be -/// brought into such a form. -static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, - unsigned Depth = 0) { +/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be +/// expressed as a conjunction. See \ref AArch64CCMP. 
+/// \param CanNegate Set to true if we can also emit the negation of the +/// tree as a conjunction. +static bool canEmitConjunction(const SDValue Val, bool &CanNegate, + unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); @@ -1609,10 +1607,10 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanNegateL; - if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) + if (!canEmitConjunction(O0, CanNegateL, Depth+1)) return false; bool CanNegateR; - if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) + if (!canEmitConjunction(O1, CanNegateR, Depth+1)) return false; if (Opcode == ISD::OR) { @@ -1620,8 +1618,11 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, // we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; - // We can however change a (not (or x y)) to (and (not x) (not y)) if we - // can negate the x and y subtrees. + // However if we can negate x and y, then we can change + // (not (or x y)) + // into + // (and (not x) (not y)) + // to eliminate the outer negation. CanNegate = CanNegateL && CanNegateR; } else { // If the operands are OR expressions then we finally need to negate their @@ -1631,7 +1632,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, bool NeedsNegOutR = O1->getOpcode() == ISD::OR; if (NeedsNegOutL && NeedsNegOutR) return false; - // We cannot negate an AND operation (it would become an OR), + // We cannot negate an AND operation. CanNegate = false; } return true; @@ -1649,7 +1650,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. 
-static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, +static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. @@ -1706,13 +1707,13 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, if (NegateOpsAndResult) { // See which side we can negate. bool CanNegateL; - bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); + bool isValidL = canEmitConjunction(LHS, CanNegateL); assert(isValidL && "Valid conjunction/disjunction tree"); (void)isValidL; #ifndef NDEBUG bool CanNegateR; - bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); + bool isValidR = canEmitConjunction(RHS, CanNegateR); assert(isValidR && "Valid conjunction/disjunction tree"); assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); #endif @@ -1734,12 +1735,12 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, + SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate, CCOp, Predicate); if (NegateOpsAndResult && !Negate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); // Emit LHS. We may need to negate it. - SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, + SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateOpsAndResult, CmpR, RHSCC); // If we transformed an OR to and AND then we have to negate the result @@ -1749,17 +1750,17 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, return CmpL; } -/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain -/// of CCMP/CFCMP ops. See @ref AArch64CCMP. 
-/// \see emitConjunctionDisjunctionTreeRec(). -static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, - AArch64CC::CondCode &OutCC) { - bool CanNegate; - if (!isConjunctionDisjunctionTree(Val, CanNegate)) +/// Emit expression as a conjunction (a series of CCMP/CFCMP ops). +/// In some cases this is even possible with OR operations in the expression. +/// See \ref AArch64CCMP. +/// \see emitConjunctionRec(). +static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC) { + bool DummyCanNegate; + if (!canEmitConjunction(Val, DummyCanNegate)) return SDValue(); - return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), - AArch64CC::AL); + return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); } /// @} @@ -1859,7 +1860,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { - if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } From cd98f42d0747826062fc3d2d2fad383aedf58dd6 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 7 Dec 2018 20:42:27 +0000 Subject: [PATCH 21/21] Merging r348444: ------------------------------------------------------------------------ r348444 | matze | 2018-12-05 17:40:23 -0800 (Wed, 05 Dec 2018) | 15 lines AArch64: Fix invalid CCMP emission The code emitting AND-subtrees used to check whether any of the operands was an OR in order to figure out if the result needs to be negated. However the OR could be hidden in further subtrees and not immediately visible. Change the code so that canEmitConjunction() determines whether the result of the generated subtree needs to be negated. Cleanup emission logic to use this. 
I also changed the code a bit to make all negation decisions early before we actually emit the subtrees. This fixes http://llvm.org/PR39550 Differential Revision: https://reviews.llvm.org/D54137 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_70@348642 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 217 ++++++++++++--------- test/CodeGen/AArch64/arm64-ccmp.ll | 72 ++++++- 2 files changed, 192 insertions(+), 97 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index dcc5957494b65..cfc7aa96d31f7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1521,33 +1521,44 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// ccmp B, inv(CB), CA /// check for CB flags /// -/// In general we can create code for arbitrary "... (and (and A B) C)" -/// sequences. We can also implement some "or" expressions, because "(or A B)" -/// is equivalent to "not (and (not A) (not B))" and we can implement some -/// negation operations: -/// We can negate the results of a single comparison by inverting the flags -/// used when the predicate fails and inverting the flags tested in the next -/// instruction; We can also negate the results of the whole previous -/// conditional compare sequence by inverting the flags tested in the next -/// instruction. However there is no way to negate the result of a partial -/// sequence. +/// This naturally lets us implement chains of AND operations with SETCC +/// operands. And we can even implement some other situations by transforming +/// them: +/// - We can implement (NEG SETCC) i.e. negating a single comparison by +/// negating the flags used in a CCMP/FCCMP operations. 
+/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
+///   by negating the flags we test for afterwards. i.e.
+///   NEG (CMP CCMP CCMP ...) can be implemented.
+/// - Note that we can only ever negate all previously processed results.
+///   What we can not implement by flipping the flags to test is a negation
+///   of two sub-trees (because the negation affects all sub-trees emitted so
+///   far, so the 2nd sub-tree we emit would also affect the first).
+/// With those tools we can implement some OR operations:
+/// - (OR (SETCC A) (SETCC B)) can be implemented via:
+///   NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
+/// - After transforming OR to NEG/AND combinations we may be able to use NEG
+///   elimination rules from earlier to implement the whole thing as a
+///   CCMP/FCCMP chain.
 ///
-/// Therefore on encountering an "or" expression we can negate the subtree on
-/// one side and have to be able to push the negate to the leafs of the subtree
-/// on the other side (see also the comments in code). As complete example:
-///     "or (or (setCA (cmp A)) (setCB (cmp B)))
-///         (and (setCC (cmp C)) (setCD (cmp D)))"
-/// is transformed to
-///     "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
-///               (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
-/// and implemented as:
+/// As complete example:
+///     or (or (setCA (cmp A)) (setCB (cmp B)))
+///        (and (setCC (cmp C)) (setCD (cmp D)))"
+/// can be reassociated to:
+///     or (and (setCC (cmp C)) setCD (cmp D))
+///        (or (setCA (cmp A)) (setCB (cmp B)))
+/// can be transformed to:
+///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
+///              (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
+/// which can be implemented as:
 ///     cmp C
 ///     ccmp D, inv(CD), CC
 ///     ccmp A, CA, inv(CD)
 ///     ccmp B, CB, inv(CA)
 ///     check for CB flags
-/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
-/// by conditional compare sequences.
+///
+/// A counterexample is "or (and A B) (and C D)" which translates to
+/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
+/// can only implement 1 of the inner (not) operations, but not both!
 /// @{
 
 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
@@ -1587,9 +1598,20 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
 
 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
 /// expressed as a conjunction. See \ref AArch64CCMP.
-/// \param CanNegate Set to true if we can also emit the negation of the
-/// tree as a conjunction.
+/// \param CanNegate Set to true if we can negate the whole sub-tree just by
+///                  changing the conditions on the SETCC tests.
+///                  (this means we can call emitConjunctionRec() with
+///                   Negate==true on this sub-tree)
+/// \param MustBeFirst Set to true if this subtree needs to be negated and we
+///                    cannot do the negation naturally. We are required to
+///                    emit the subtree first in this case.
+/// \param WillNegate Is true if we are called when the result of this
+///                   subexpression must be negated. This happens when the
+///                   outer expression is an OR. We can use this fact to know
+///                   that we have a double negation (or (or ...) ...) that
+///                   can be implemented for free.
 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
+                               bool &MustBeFirst, bool WillNegate,
                                unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
@@ -1598,42 +1620,44 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
     if (Val->getOperand(0).getValueType() == MVT::f128)
       return false;
     CanNegate = true;
+    MustBeFirst = false;
     return true;
   }
   // Protect against exponential runtime and stack overflow.
   if (Depth > 6)
     return false;
 
   if (Opcode == ISD::AND || Opcode == ISD::OR) {
+    bool IsOR = Opcode == ISD::OR;
+
     SDValue O0 = Val->getOperand(0);
     SDValue O1 = Val->getOperand(1);
     bool CanNegateL;
-    if (!canEmitConjunction(O0, CanNegateL, Depth+1))
+    bool MustBeFirstL;
+    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
       return false;
     bool CanNegateR;
-    if (!canEmitConjunction(O1, CanNegateR, Depth+1))
+    bool MustBeFirstR;
+    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
+      return false;
+
+    if (MustBeFirstL && MustBeFirstR)
       return false;
 
-    if (Opcode == ISD::OR) {
-      // For an OR expression we need to be able to negate at least one side or
-      // we cannot do the transformation at all.
+    if (IsOR) {
+      // For an OR expression we need to be able to naturally negate at least
+      // one side or we cannot do the transformation at all.
       if (!CanNegateL && !CanNegateR)
         return false;
-      // However if we can negate x and y, then we can change
-      // (not (or x y))
-      // into
-      // (and (not x) (not y))
-      // to eliminate the outer negation.
-      CanNegate = CanNegateL && CanNegateR;
+      // If the result of the OR will be negated and we can naturally negate
+      // the leafs, then this sub-tree as a whole negates naturally.
+      CanNegate = WillNegate && CanNegateL && CanNegateR;
+      // If we cannot naturally negate the whole sub-tree, then this must be
+      // emitted first.
+      MustBeFirst = !CanNegate;
     } else {
-      // If the operands are OR expressions then we finally need to negate their
-      // outputs, we can only do that for the operand with emitted last by
-      // negating OutCC, not for both operands.
-      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
-      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
-      if (NeedsNegOutL && NeedsNegOutR)
-        return false;
-      // We cannot negate an AND operation.
+      assert(Opcode == ISD::AND && "Must be OR or AND");
+      // We cannot naturally negate an AND operation.
CanNegate = false; + MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } @@ -1646,10 +1670,8 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate, /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. -/// On recursive invocations @p PushNegate may be set to true to have negation -/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate -/// for the comparisons in the current subtree; @p Depth limits the search -/// depth to avoid stack overflow. +/// \p Negate is true if we want this sub-tree being negated just by changing +/// SETCC conditions. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { @@ -1691,61 +1713,69 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); } - assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && - "Valid conjunction/disjunction tree"); + assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); - // Check if both sides can be transformed. - SDValue LHS = Val->getOperand(0); - SDValue RHS = Val->getOperand(1); - - // In case of an OR we need to negate our operands and the result. - // (A v B) <=> not(not(A) ^ not(B)) - bool NegateOpsAndResult = Opcode == ISD::OR; - // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. The other side - // must be negatable by itself. - if (NegateOpsAndResult) { - // See which side we can negate. 
- bool CanNegateL; - bool isValidL = canEmitConjunction(LHS, CanNegateL); - assert(isValidL && "Valid conjunction/disjunction tree"); - (void)isValidL; + bool IsOR = Opcode == ISD::OR; -#ifndef NDEBUG - bool CanNegateR; - bool isValidR = canEmitConjunction(RHS, CanNegateR); - assert(isValidR && "Valid conjunction/disjunction tree"); - assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); -#endif + SDValue LHS = Val->getOperand(0); + bool CanNegateL; + bool MustBeFirstL; + bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); + assert(ValidL && "Valid conjunction/disjunction tree"); + (void)ValidL; - // Order the side which we cannot negate to RHS so we can emit it first. - if (!CanNegateL) + SDValue RHS = Val->getOperand(1); + bool CanNegateR; + bool MustBeFirstR; + bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); + assert(ValidR && "Valid conjunction/disjunction tree"); + (void)ValidR; + + // Swap sub-tree that must come first to the right side. + if (MustBeFirstL) { + assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); + std::swap(LHS, RHS); + std::swap(CanNegateL, CanNegateR); + std::swap(MustBeFirstL, MustBeFirstR); + } + + bool NegateR; + bool NegateAfterR; + bool NegateL; + bool NegateAfterAll; + if (Opcode == ISD::OR) { + // Swap the sub-tree that we can negate naturally to the left. + if (!CanNegateL) { + assert(CanNegateR && "at least one side must be negatable"); + assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); + assert(!Negate); std::swap(LHS, RHS); + NegateR = false; + NegateAfterR = true; + } else { + // Negate the left sub-tree if possible, otherwise negate the result. 
+ NegateR = CanNegateR; + NegateAfterR = !CanNegateR; + } + NegateL = true; + NegateAfterAll = !Negate; } else { - bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && - "Valid conjunction/disjunction tree"); - // Order the side where we need to negate the output flags to RHS so it - // gets emitted first. - if (NeedsNegOutL) - std::swap(LHS, RHS); + assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); + assert(!Negate && "Valid conjunction/disjunction tree"); + + NegateL = false; + NegateR = false; + NegateAfterR = false; + NegateAfterAll = false; } - // Emit RHS. If we want to negate the tree we only need to push a negate - // through if we are already in a PushNegate case, otherwise we can negate - // the "flags to test" afterwards. + // Emit sub-trees. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate, - CCOp, Predicate); - if (NegateOpsAndResult && !Negate) + SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); + if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We may need to negate it. - SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, - NegateOpsAndResult, CmpR, - RHSCC); - // If we transformed an OR to and AND then we have to negate the result - // (or absorb the Negate parameter). 
- if (NegateOpsAndResult && !Negate) + SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); + if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } @@ -1757,7 +1787,8 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC) { bool DummyCanNegate; - if (!canEmitConjunction(Val, DummyCanNegate)) + bool DummyMustBeFirst; + if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) return SDValue(); return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index b18e638a3a947..6b497e8f7bfda 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -526,8 +526,8 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3 ; CHECK-LABEL: select_or_one_olt: ; CHECK-LABEL: ; %bb.0: ; CHECK-NEXT: fcmp d0, d1 -; CHECK-NEXT: fccmp d0, d1, #1, ne -; CHECK-NEXT: fccmp d2, d3, #8, vs +; CHECK-NEXT: fccmp d0, d1, #8, le +; CHECK-NEXT: fccmp d2, d3, #8, pl ; CHECK-NEXT: csel w0, w0, w1, mi ; CHECK-NEXT: ret define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 { @@ -556,8 +556,8 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3 ; CHECK-LABEL: select_or_ueq_olt: ; CHECK-LABEL: ; %bb.0: ; CHECK-NEXT: fcmp d0, d1 -; CHECK-NEXT: fccmp d0, d1, #8, le -; CHECK-NEXT: fccmp d2, d3, #8, mi +; CHECK-NEXT: fccmp d0, d1, #1, ne +; CHECK-NEXT: fccmp d2, d3, #8, vc ; CHECK-NEXT: csel w0, w0, w1, mi ; CHECK-NEXT: ret define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 { @@ -656,4 +656,68 @@ define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, ret i32 %sel } +; This testcase resembles the core problem of http://llvm.org/PR39550 +; 
(an OR operation is 2 levels deep but needs to be implemented first) +; CHECK-LABEL: deep_or +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w1, #0, #4, eq +; CHECK-NEXT: ccmp w0, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %or, %c1 + %and1 = and i1 %and0, %c0 + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + +; Variation of deep_or, we still need to implement the OR first though. +; CHECK-LABEL: deep_or1 +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w0, #0, #4, eq +; CHECK-NEXT: ccmp w1, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %c0, %or + %and1 = and i1 %and0, %c1 + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + +; Variation of deep_or, we still need to implement the OR first though. +; CHECK-LABEL: deep_or2 +; CHECK: cmp w2, #20 +; CHECK-NEXT: ccmp w2, #15, #4, ne +; CHECK-NEXT: ccmp w1, #0, #4, eq +; CHECK-NEXT: ccmp w0, #0, #4, ne +; CHECK-NEXT: csel w0, w4, w5, ne +; CHECK-NEXT: ret +define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { + %c0 = icmp ne i32 %a0, 0 + %c1 = icmp ne i32 %a1, 0 + %c2 = icmp eq i32 %a2, 15 + %c3 = icmp eq i32 %a2, 20 + + %or = or i1 %c2, %c3 + %and0 = and i1 %c0, %c1 + %and1 = and i1 %and0, %or + %sel = select i1 %and1, i32 %x, i32 %y + ret i32 %sel +} + attributes #0 = { nounwind }