diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index caa538082fc74..42597c871eae8 100644 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -127,45 +127,55 @@ if(HAVE_LIBPTHREAD) set(LLVM_PTHREAD_LIB ${CMAKE_THREAD_LIBS_INIT}) endif() -# Don't look for these libraries on Windows. Also don't look for them if we're -# using MSan, since uninstrumented third party code may call MSan interceptors -# like strlen, leading to false positives. -if( NOT PURE_WINDOWS AND NOT LLVM_USE_SANITIZER MATCHES "Memory.*") - if (LLVM_ENABLE_ZLIB) - check_library_exists(z compress2 "" HAVE_LIBZ) - else() - set(HAVE_LIBZ 0) - endif() - # Skip libedit if using ASan as it contains memory leaks. - if (LLVM_ENABLE_LIBEDIT AND HAVE_HISTEDIT_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*") - check_library_exists(edit el_init "" HAVE_LIBEDIT) - else() - set(HAVE_LIBEDIT 0) - endif() - if(LLVM_ENABLE_TERMINFO) - set(HAVE_TERMINFO 0) - foreach(library tinfo terminfo curses ncurses ncursesw) +# Don't look for these libraries if we're using MSan, since uninstrumented third +# party code may call MSan interceptors like strlen, leading to false positives. +if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") + set(HAVE_LIBZ 0) + if(LLVM_ENABLE_ZLIB) + foreach(library z zlib_static zlib) string(TOUPPER ${library} library_suffix) - check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix}) - if(HAVE_TERMINFO_${library_suffix}) - set(HAVE_TERMINFO 1) - set(TERMINFO_LIBS "${library}") + check_library_exists(${library} compress2 "" HAVE_LIBZ_${library_suffix}) + if(HAVE_LIBZ_${library_suffix}) + set(HAVE_LIBZ 1) + set(ZLIB_LIBRARIES "${library}") break() endif() endforeach() - else() - set(HAVE_TERMINFO 0) endif() - find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c) - set(LLVM_LIBXML2_ENABLED 0) - set(LIBXML2_FOUND 0) - if((LLVM_ENABLE_LIBXML2) AND ((CMAKE_SYSTEM_NAME MATCHES "Linux") AND (ICONV_LIBRARY_PATH) OR APPLE)) - find_package(LibXml2) - if (LIBXML2_FOUND) - set(LLVM_LIBXML2_ENABLED 1) - include_directories(${LIBXML2_INCLUDE_DIR}) - set(LIBXML2_LIBS "xml2") + # Don't look for these libraries on Windows. + if (NOT PURE_WINDOWS) + # Skip libedit if using ASan as it contains memory leaks. 
+ if (LLVM_ENABLE_LIBEDIT AND HAVE_HISTEDIT_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*") + check_library_exists(edit el_init "" HAVE_LIBEDIT) + else() + set(HAVE_LIBEDIT 0) + endif() + if(LLVM_ENABLE_TERMINFO) + set(HAVE_TERMINFO 0) + foreach(library tinfo terminfo curses ncurses ncursesw) + string(TOUPPER ${library} library_suffix) + check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix}) + if(HAVE_TERMINFO_${library_suffix}) + set(HAVE_TERMINFO 1) + set(TERMINFO_LIBS "${library}") + break() + endif() + endforeach() + else() + set(HAVE_TERMINFO 0) + endif() + + find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c) + set(LLVM_LIBXML2_ENABLED 0) + set(LIBXML2_FOUND 0) + if((LLVM_ENABLE_LIBXML2) AND ((CMAKE_SYSTEM_NAME MATCHES "Linux") AND (ICONV_LIBRARY_PATH) OR APPLE)) + find_package(LibXml2) + if (LIBXML2_FOUND) + set(LLVM_LIBXML2_ENABLED 1) + include_directories(${LIBXML2_INCLUDE_DIR}) + set(LIBXML2_LIBS "xml2") + endif() endif() endif() endif() diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 26da2d13eed20..90cba50d94c5e 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -768,7 +768,7 @@ macro(add_llvm_executable name) # libpthreads overrides some standard library symbols, so main # executable must be linked with it in order to provide consistent # API for all shared libaries loaded by this executable. - target_link_libraries(${name} ${LLVM_PTHREAD_LIB}) + target_link_libraries(${name} PRIVATE ${LLVM_PTHREAD_LIB}) endif() endmacro(add_llvm_executable name) @@ -1093,7 +1093,7 @@ function(add_unittest test_suite test_name) # libpthreads overrides some standard library symbols, so main # executable must be linked with it in order to provide consistent # API for all shared libaries loaded by this executable. - target_link_libraries(${test_name} gtest_main gtest ${LLVM_PTHREAD_LIB}) + target_link_libraries(${test_name} PRIVATE gtest_main gtest ${LLVM_PTHREAD_LIB}) add_dependencies(${test_suite} ${test_name}) get_target_property(test_suite_folder ${test_suite} FOLDER) diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake index 2b9ab23c47704..10fd52609274e 100644 --- a/cmake/modules/LLVM-Config.cmake +++ b/cmake/modules/LLVM-Config.cmake @@ -87,7 +87,7 @@ macro(llvm_config executable) endif() endif() - target_link_libraries(${executable} LLVM) + target_link_libraries(${executable} PRIVATE LLVM) endif() explicit_llvm_config(${executable} ${link_components}) @@ -99,9 +99,9 @@ function(explicit_llvm_config executable) llvm_map_components_to_libnames(LIBRARIES ${link_components}) get_target_property(t ${executable} TYPE) - if("x${t}" STREQUAL "xSTATIC_LIBRARY") + if(t STREQUAL "STATIC_LIBRARY") target_link_libraries(${executable} INTERFACE ${LIBRARIES}) - elseif("x${t}" STREQUAL "xSHARED_LIBRARY" OR "x${t}" STREQUAL "xMODULE_LIBRARY") + elseif(t STREQUAL "EXECUTABLE" OR t STREQUAL "SHARED_LIBRARY" OR t STREQUAL "MODULE_LIBRARY") target_link_libraries(${executable} PRIVATE ${LIBRARIES}) else() # Use plain form for legacy user. 
diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake index 373387c755c8c..709c7c2556cf3 100644 --- a/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/cmake/modules/LLVMExternalProjectUtils.cmake @@ -95,7 +95,7 @@ function(llvm_ExternalProject_Add name source_dir) foreach(prefix ${ARG_PASSTHROUGH_PREFIXES}) foreach(variableName ${variableNames}) if(variableName MATCHES "^${prefix}") - string(REPLACE ";" "," value "${${variableName}}") + string(REPLACE ";" "|" value "${${variableName}}") list(APPEND PASSTHROUGH_VARIABLES -D${variableName}=${value}) endif() @@ -160,7 +160,7 @@ function(llvm_ExternalProject_Add name source_dir) USES_TERMINAL_CONFIGURE 1 USES_TERMINAL_BUILD 1 USES_TERMINAL_INSTALL 1 - LIST_SEPARATOR , + LIST_SEPARATOR | ) if(ARG_USE_TOOLCHAIN) diff --git a/docs/XRay.rst b/docs/XRay.rst index b4443c4d8060a..942b479af749c 100644 --- a/docs/XRay.rst +++ b/docs/XRay.rst @@ -143,15 +143,25 @@ variable, where we list down the options and their defaults below. | | | | instrumentation points | | | | | before main. | +-------------------+-----------------+---------------+------------------------+ -| xray_naive_log | ``bool`` | ``false`` | Whether to install | -| | | | the naive log | -| | | | implementation. | +| xray_mode | ``const char*`` | ``""`` | Default mode to | +| | | | install and initialize | +| | | | before ``main``. | +-------------------+-----------------+---------------+------------------------+ | xray_logfile_base | ``const char*`` | ``xray-log.`` | Filename base for the | | | | | XRay logfile. | +-------------------+-----------------+---------------+------------------------+ -| xray_fdr_log | ``bool`` | ``false`` | Whether to install the | -| | | | Flight Data Recorder | +| xray_naive_log | ``bool`` | ``false`` | **DEPRECATED:** Use | +| | | | xray_mode=xray-basic | +| | | | instead. Whether to | +| | | | install the basic log | +| | | | the naive log | +| | | | implementation. | ++-------------------+-----------------+---------------+------------------------+ +| xray_fdr_log | ``bool`` | ``false`` | **DEPRECATED:** Use | +| | | | xray_mode=xray-fdr | +| | | | instead. Whether to | +| | | | install the Flight | +| | | | Data Recorder | | | | | (FDR) mode. | +-------------------+-----------------+---------------+------------------------+ @@ -241,6 +251,14 @@ following API: - ``__xray_set_log_impl(...)``: This function takes a struct of type ``XRayLogImpl``, which is defined in ``xray/xray_log_interface.h``, part of the XRay compiler-rt installation. +- ``__xray_log_register_mode(...)``: Register a logging implementation against + a string Mode. The implementation is an instance of ``XRayLogImpl`` defined + in ``xray/xray_log_interface.h``. +- ``__xray_log_select_mode(...)``: Select the mode to install, associated with + a string Mode. Only implementations registered with + ``__xray_log_register_mode(...)`` can be chosen with this function. When + successful, has the same effects as calling ``__xray_set_log_impl(...)`` with + the registered logging implementation. - ``__xray_log_init(...)``: This function allows for initializing and re-initializing an installed logging implementation. See ``xray/xray_log_interface.h`` for details, part of the XRay compiler-rt diff --git a/docs/XRayExample.rst b/docs/XRayExample.rst index 953833bc1ef65..9f74442bcebe9 100644 --- a/docs/XRayExample.rst +++ b/docs/XRayExample.rst @@ -60,7 +60,7 @@ to enable XRay at application start. 
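The new mode machinery documented above can also be driven from code, not just via the environment variable shown in the example that follows. A hedged sketch, assuming compiler-rt's xray/xray_log_interface.h; the exact status enumerator spellings may differ between releases, and the function name is illustrative:

```cpp
#include "xray/xray_log_interface.h"
#include <cassert>

void selectBasicModeEarly() {
  // compiler-rt registers its shipped implementations under string names;
  // "xray-basic" is the replacement for the deprecated xray_naive_log flag.
  auto Status = __xray_log_select_mode("xray-basic");
  assert(Status == XRayLogRegisterStatus::XRAY_REGISTRATION_OK);

  // Same effect as calling __xray_set_log_impl(...) with the registered
  // implementation; initialize it before any events are recorded.
  __xray_log_init(/*BufferSize=*/4096, /*MaxBuffers=*/16,
                  /*Args=*/nullptr, /*ArgsSize=*/0);
}
```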
To do this, XRay checks the $ ./bin/llc input.ll # We need to set the XRAY_OPTIONS to enable some features. - $ XRAY_OPTIONS="patch_premain=true xray_naive_log=true" ./bin/llc input.ll + $ XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic" ./bin/llc input.ll ==69819==XRay: Log file in 'xray-log.llc.m35qPB' At this point we now have an XRay trace we can start analysing. diff --git a/examples/ParallelJIT/CMakeLists.txt b/examples/ParallelJIT/CMakeLists.txt index deeee072b33ca..c42dfc85c14a0 100644 --- a/examples/ParallelJIT/CMakeLists.txt +++ b/examples/ParallelJIT/CMakeLists.txt @@ -11,4 +11,4 @@ add_llvm_example(ParallelJIT ParallelJIT.cpp ) -target_link_libraries(ParallelJIT ${LLVM_PTHREAD_LIB}) +target_link_libraries(ParallelJIT PRIVATE ${LLVM_PTHREAD_LIB}) diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h index 41bb03cac07bf..f7a2a6ba3fcb0 100644 --- a/include/llvm/Analysis/AliasAnalysis.h +++ b/include/llvm/Analysis/AliasAnalysis.h @@ -95,19 +95,60 @@ enum AliasResult { /// /// This is no access at all, a modification, a reference, or both /// a modification and a reference. These are specifically structured such that -/// they form a two bit matrix and bit-tests for 'mod' or 'ref' work with any -/// of the possible values. +/// they form a two bit matrix and bit-tests for 'mod' or 'ref' +/// work with any of the possible values. + enum ModRefInfo { /// The access neither references nor modifies the value stored in memory. MRI_NoModRef = 0, - /// The access references the value stored in memory. + /// The access may reference the value stored in memory. MRI_Ref = 1, - /// The access modifies the value stored in memory. + /// The access may modify the value stored in memory. MRI_Mod = 2, - /// The access both references and modifies the value stored in memory. - MRI_ModRef = MRI_Ref | MRI_Mod + /// The access may reference and may modify the value stored in memory. + MRI_ModRef = MRI_Ref | MRI_Mod, }; +LLVM_NODISCARD inline bool isNoModRef(const ModRefInfo MRI) { + return MRI == MRI_NoModRef; +} +LLVM_NODISCARD inline bool isModOrRefSet(const ModRefInfo MRI) { + return MRI & MRI_ModRef; +} +LLVM_NODISCARD inline bool isModAndRefSet(const ModRefInfo MRI) { + return (MRI & MRI_ModRef) == MRI_ModRef; +} +LLVM_NODISCARD inline bool isModSet(const ModRefInfo MRI) { + return MRI & MRI_Mod; +} +LLVM_NODISCARD inline bool isRefSet(const ModRefInfo MRI) { + return MRI & MRI_Ref; +} + +LLVM_NODISCARD inline ModRefInfo setRef(const ModRefInfo MRI) { + return ModRefInfo(MRI | MRI_Ref); +} +LLVM_NODISCARD inline ModRefInfo setMod(const ModRefInfo MRI) { + return ModRefInfo(MRI | MRI_Mod); +} +LLVM_NODISCARD inline ModRefInfo setModAndRef(const ModRefInfo MRI) { + return ModRefInfo(MRI | MRI_ModRef); +} +LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) { + return ModRefInfo(MRI & MRI_Ref); +} +LLVM_NODISCARD inline ModRefInfo clearRef(const ModRefInfo MRI) { + return ModRefInfo(MRI & MRI_Mod); +} +LLVM_NODISCARD inline ModRefInfo unionModRef(const ModRefInfo MRI1, + const ModRefInfo MRI2) { + return ModRefInfo(MRI1 | MRI2); +} +LLVM_NODISCARD inline ModRefInfo intersectModRef(const ModRefInfo MRI1, + const ModRefInfo MRI2) { + return ModRefInfo(MRI1 & MRI2); +} + /// The locations at which a function might access memory. 
/// /// These are primarily used in conjunction with the \c AccessKind bits to @@ -187,6 +228,15 @@ enum FunctionModRefBehavior { FMRB_UnknownModRefBehavior = FMRL_Anywhere | MRI_ModRef }; +// Wrapper method strips bits significant only in FunctionModRefBehavior, +// to obtain a valid ModRefInfo. The benefit of using the wrapper is that if +// ModRefInfo enum changes, the wrapper can be updated to & with the new enum +// entry with all bits set to 1. +LLVM_NODISCARD inline ModRefInfo +createModRefInfo(const FunctionModRefBehavior FMRB) { + return ModRefInfo(FMRB & MRI_ModRef); +} + class AAResults { public: // Make these results default constructable and movable. We have to spell @@ -520,14 +570,7 @@ class AAResults { const Optional &OptLoc) { if (OptLoc == None) { if (auto CS = ImmutableCallSite(I)) { - auto MRB = getModRefBehavior(CS); - if ((MRB & MRI_ModRef) == MRI_ModRef) - return MRI_ModRef; - if (MRB & MRI_Ref) - return MRI_Ref; - if (MRB & MRI_Mod) - return MRI_Mod; - return MRI_NoModRef; + return createModRefInfo(getModRefBehavior(CS)); } } @@ -570,7 +613,7 @@ class AAResults { /// \brief Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I - /// in a BasicBlock. A ordered basic block \p OBB can be used to speed up + /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up /// instruction ordering queries inside the BasicBlock containing \p I. ModRefInfo callCapturesBefore(const Instruction *I, const MemoryLocation &MemLoc, DominatorTree *DT, diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index b0859567db41e..ac2c055ab1452 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -277,7 +277,7 @@ bool InstructionSelector::executeMatchTable( return false; for (const auto &MMO : State.MIs[InsnID]->memoperands()) - if (isAtLeastOrStrongerThan(MMO->getOrdering(), Ordering)) + if (!isStrongerThan(Ordering, MMO->getOrdering())) if (handleReject() == RejectAndGiveUp) return false; break; diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 1ea9dba5e990d..bed7c72489227 100644 --- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -571,30 +571,6 @@ class MachineIRBuilder { MachineInstrBuilder buildStore(unsigned Val, unsigned Addr, MachineMemOperand &MMO); - /// Build and insert `Res = G_ATOMIC_LOAD Addr, MMO`. - /// - /// Loads the value stored at \p Addr. Puts the result in \p Res. - /// - /// \pre setBasicBlock or setMI must have been called. - /// \pre \p Res must be a generic virtual register. - /// \pre \p Addr must be a generic virtual register with pointer type. - /// - /// \return a MachineInstrBuilder for the newly created instruction. - MachineInstrBuilder buildAtomicLoad(unsigned Res, unsigned Addr, - MachineMemOperand &MMO); - - /// Build and insert `G_ATOMIC_STORE Val, Addr, MMO`. - /// - /// Stores the value \p Val to \p Addr. - /// - /// \pre setBasicBlock or setMI must have been called. - /// \pre \p Val must be a generic virtual register. - /// \pre \p Addr must be a generic virtual register with pointer type. - /// - /// \return a MachineInstrBuilder for the newly created instruction. 
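A quick illustration of the new predicate and mutator helpers, together with the createModRefInfo wrapper introduced just above. This is only a sketch exercising the functions shown in the hunks; the function name is illustrative:

```cpp
#include "llvm/Analysis/AliasAnalysis.h"
#include <cassert>
using namespace llvm;

void modRefHelperDemo() {
  ModRefInfo MRI = setMod(MRI_Ref);                 // MRI_ModRef
  assert(isModAndRefSet(MRI));
  assert(isRefSet(clearMod(MRI)) && !isModSet(clearMod(MRI)));
  assert(unionModRef(MRI_Ref, MRI_Mod) == MRI_ModRef);
  assert(isNoModRef(intersectModRef(MRI_Ref, MRI_Mod)));
  // createModRefInfo strips the FMRL_* location bits, leaving pure mod/ref.
  assert(createModRefInfo(FMRB_OnlyReadsMemory) == MRI_Ref);
}
```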
- MachineInstrBuilder buildAtomicStore(unsigned Val, unsigned Addr, - MachineMemOperand &MMO); - /// Build and insert `Res0, ... = G_EXTRACT Src, Idx0`. /// /// \pre setBasicBlock or setMI must have been called. diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 9e4865ff2c26f..d256849be9afb 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -186,7 +186,8 @@ namespace ISD { /// BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways. /// Given two values of the same integer value type, this produces a value /// twice as big. Like EXTRACT_ELEMENT, this can only be used before - /// legalization. + /// legalization. The lower part of the composite value should be in + /// element 0 and the upper part should be in element 1. BUILD_PAIR, /// MERGE_VALUES - This node takes multiple discrete operands and returns diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 9521c277988a8..f887517217e18 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -115,7 +115,7 @@ class MachineFrameInfo { /// slot can't alias any LLVM IR value. This is very similar to a Spill /// Slot, but is created by statepoint lowering is SelectionDAG, not the /// register allocator. - bool isStatepointSpillSlot; + bool isStatepointSpillSlot = false; /// Identifier for stack memory type analagous to address space. If this is /// non-0, the meaning is target defined. Offsets cannot be directly @@ -131,7 +131,7 @@ class MachineFrameInfo { // If true, the object was mapped into the local frame // block and doesn't need additional handling for allocation beyond that. - bool PreAllocated; + bool PreAllocated = false; // If true, an LLVM IR value might point to this object. // Normally, spill slots and fixed-offset objects don't alias IR-accessible @@ -140,17 +140,17 @@ class MachineFrameInfo { bool isAliased; /// If true, the object has been zero-extended. - bool isZExt; + bool isZExt = false; /// If true, the object has been zero-extended. - bool isSExt; - - StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM, - bool isSS, const AllocaInst *Val, bool Aliased, uint8_t ID = 0) - : SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM), - isSpillSlot(isSS), isStatepointSpillSlot(false), StackID(ID), - Alloca(Val), - PreAllocated(false), isAliased(Aliased), isZExt(false), isSExt(false) {} + bool isSExt = false; + + StackObject(uint64_t Size, unsigned Alignment, int64_t SPOffset, + bool IsImmutable, bool IsSpillSlot, const AllocaInst *Alloca, + bool IsAliased, uint8_t StackID = 0) + : SPOffset(SPOffset), Size(Size), Alignment(Alignment), + isImmutable(IsImmutable), isSpillSlot(IsSpillSlot), + StackID(StackID), Alloca(Alloca), isAliased(IsAliased) {} }; /// The alignment of the stack. @@ -573,13 +573,13 @@ class MachineFrameInfo { /// All fixed objects should be created before other objects are created for /// efficiency. By default, fixed objects are not pointed to by LLVM IR /// values. This returns an index with a negative value. - int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable, + int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased = false); /// Create a spill slot at a fixed location on the stack. /// Returns an index with a negative value. 
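For reference, the renamed MachineFrameInfo entry points above are used like this. A sketch with illustrative sizes and offsets; MFI is assumed to be supplied by the caller:

```cpp
#include "llvm/CodeGen/MachineFrameInfo.h"
using namespace llvm;

void createFrameObjects(MachineFrameInfo &MFI) {
  // An ordinary 8-byte, 8-aligned local that is not a spill slot.
  int LocalFI = MFI.CreateStackObject(/*Size=*/8, /*Alignment=*/8,
                                      /*isSpillSlot=*/false);
  // Fixed objects (e.g. incoming stack arguments) get negative indices.
  int ArgFI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/16,
                                    /*IsImmutable=*/true);
  MFI.setIsImmutableObjectIndex(ArgFI, /*IsImmutable=*/false);
  (void)LocalFI;
}
```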
int CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset, - bool Immutable = false); + bool IsImmutable = false); /// Returns true if the specified index corresponds to a fixed stack object. bool isFixedObjectIndex(int ObjectIdx) const { @@ -605,10 +605,10 @@ class MachineFrameInfo { } /// Marks the immutability of an object. - void setIsImmutableObjectIndex(int ObjectIdx, bool Immutable) { + void setIsImmutableObjectIndex(int ObjectIdx, bool IsImmutable) { assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && "Invalid Object Idx!"); - Objects[ObjectIdx+NumFixedObjects].isImmutable = Immutable; + Objects[ObjectIdx+NumFixedObjects].isImmutable = IsImmutable; } /// Returns true if the specified index corresponds to a spill slot. @@ -660,7 +660,7 @@ class MachineFrameInfo { /// Create a new statically sized stack object, returning /// a nonnegative identifier to represent it. - int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, + int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSpillSlot, const AllocaInst *Alloca = nullptr, uint8_t ID = 0); /// Create a new statically sized stack object that represents a spill slot, diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index be9b89eb77efe..3be94f8021701 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -84,14 +84,15 @@ class MachineRegisterInfo { /// all registers that were disabled are removed from the list. SmallVector UpdatedCSRs; - /// RegAllocHints - This vector records register allocation hints for virtual - /// registers. For each virtual register, it keeps a register and hint type - /// pair making up the allocation hint. Hint type is target specific except - /// for the value 0 which means the second value of the pair is the preferred - /// register for allocation. For example, if the hint is <0, 1024>, it means - /// the allocator should prefer the physical register allocated to the virtual - /// register of the hint. - IndexedMap, VirtReg2IndexFunctor> RegAllocHints; + /// RegAllocHints - This vector records register allocation hints for + /// virtual registers. For each virtual register, it keeps a pair of hint + /// type and hints vector making up the allocation hints. Only the first + /// hint may be target specific, and in that case this is reflected by the + /// first member of the pair being non-zero. If the hinted register is + /// virtual, it means the allocator should prefer the physical register + /// allocated to it if any. + IndexedMap>, + VirtReg2IndexFunctor> RegAllocHints; /// PhysRegUseDefLists - This is an array of the head of the use/def list for /// physical registers. @@ -706,35 +707,61 @@ class MachineRegisterInfo { void clearVirtRegs(); /// setRegAllocationHint - Specify a register allocation hint for the - /// specified virtual register. + /// specified virtual register. This is typically used by target, and in case + /// of an earlier hint it will be overwritten. void setRegAllocationHint(unsigned VReg, unsigned Type, unsigned PrefReg) { assert(TargetRegisterInfo::isVirtualRegister(VReg)); RegAllocHints[VReg].first = Type; - RegAllocHints[VReg].second = PrefReg; + RegAllocHints[VReg].second.clear(); + RegAllocHints[VReg].second.push_back(PrefReg); } - /// Specify the preferred register allocation hint for the specified virtual - /// register. + /// addRegAllocationHint - Add a register allocation hint to the hints + /// vector for VReg. 
+ void addRegAllocationHint(unsigned VReg, unsigned PrefReg) { + assert(TargetRegisterInfo::isVirtualRegister(VReg)); + RegAllocHints[VReg].second.push_back(PrefReg); + } + + /// Specify the preferred (target independent) register allocation hint for + /// the specified virtual register. void setSimpleHint(unsigned VReg, unsigned PrefReg) { setRegAllocationHint(VReg, /*Type=*/0, PrefReg); } + void clearSimpleHint(unsigned VReg) { + assert (RegAllocHints[VReg].first == 0 && + "Expected to clear a non-target hint!"); + RegAllocHints[VReg].second.clear(); + } + /// getRegAllocationHint - Return the register allocation hint for the - /// specified virtual register. + /// specified virtual register. If there are many hints, this returns the + /// one with the greatest weight. std::pair getRegAllocationHint(unsigned VReg) const { assert(TargetRegisterInfo::isVirtualRegister(VReg)); - return RegAllocHints[VReg]; + unsigned BestHint = (RegAllocHints[VReg].second.size() ? + RegAllocHints[VReg].second[0] : 0); + return std::pair(RegAllocHints[VReg].first, BestHint); } - /// getSimpleHint - Return the preferred register allocation hint, or 0 if a - /// standard simple hint (Type == 0) is not set. + /// getSimpleHint - same as getRegAllocationHint except it will only return + /// a target independent hint. unsigned getSimpleHint(unsigned VReg) const { assert(TargetRegisterInfo::isVirtualRegister(VReg)); std::pair Hint = getRegAllocationHint(VReg); return Hint.first ? 0 : Hint.second; } + /// getRegAllocationHints - Return a reference to the vector of all + /// register allocation hints for VReg. + const std::pair> + &getRegAllocationHints(unsigned VReg) const { + assert(TargetRegisterInfo::isVirtualRegister(VReg)); + return RegAllocHints[VReg]; + } + /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the /// specified register as undefined which causes the DBG_VALUE to be /// deleted during LiveDebugVariables analysis. diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h index 4210f58ddb03d..f9a61b8bf1ab7 100644 --- a/include/llvm/CodeGen/TargetLowering.h +++ b/include/llvm/CodeGen/TargetLowering.h @@ -1360,6 +1360,12 @@ class TargetLoweringBase { /// getIRStackGuard returns nullptr. virtual Value *getSDagStackGuard(const Module &M) const; + /// If this function returns true, stack protection checks should XOR the + /// frame pointer (or whichever pointer is used to address locals) into the + /// stack guard value before checking it. getIRStackGuard must return nullptr + /// if this returns true. + virtual bool useStackGuardXorFP() const { return false; } + /// If the target has a standard stack protection check function that /// performs validation and error handling, returns the function. Otherwise, /// returns nullptr. Must be previously inserted by insertSSPDeclarations. @@ -3487,6 +3493,11 @@ class TargetLowering : public TargetLoweringBase { return false; } + virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, + const SDLoc &DL) const { + llvm_unreachable("not implemented for this target"); + } + /// Lower TLS global address SDNode for target independent emulated TLS model. 
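Putting the extended hint API from the MachineRegisterInfo hunk above together. A sketch; MRI, VReg, and the registers are assumed to come from the caller:

```cpp
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

void hintDemo(MachineRegisterInfo &MRI, unsigned VReg, unsigned PhysA,
              unsigned PhysB) {
  MRI.setSimpleHint(VReg, PhysA);        // type 0 hint; resets the vector
  MRI.addRegAllocationHint(VReg, PhysB); // extra copy hint, appended
  // The single-pair query still works and returns the first (best) hint.
  std::pair<unsigned, unsigned> Best = MRI.getRegAllocationHint(VReg);
  // Targets can inspect the whole vector when building allocation orders.
  const auto &All = MRI.getRegAllocationHints(VReg);
  (void)Best;
  (void)All;
}
```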
   virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG) const;
diff --git a/include/llvm/CodeGen/TargetOpcodes.def b/include/llvm/CodeGen/TargetOpcodes.def
index 3cca0eb567e0c..d3e8483798a7f 100644
--- a/include/llvm/CodeGen/TargetOpcodes.def
+++ b/include/llvm/CodeGen/TargetOpcodes.def
@@ -265,12 +265,6 @@ HANDLE_TARGET_OPCODE(G_LOAD)
 /// Generic store.
 HANDLE_TARGET_OPCODE(G_STORE)
 
-/// Generic atomic load
-HANDLE_TARGET_OPCODE(G_ATOMIC_LOAD)
-
-/// Generic atomic store
-HANDLE_TARGET_OPCODE(G_ATOMIC_STORE)
-
 /// Generic atomic cmpxchg with internal success check.
 HANDLE_TARGET_OPCODE(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
 
diff --git a/include/llvm/CodeGen/TargetRegisterInfo.h b/include/llvm/CodeGen/TargetRegisterInfo.h
index 92d38d51feefe..cc612a42d756e 100644
--- a/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -785,11 +785,10 @@ class TargetRegisterInfo : public MCRegisterInfo {
   /// as returned from RegisterClassInfo::getOrder(). The hint registers must
   /// come from Order, and they must not be reserved.
   ///
-  /// The default implementation of this function can resolve
-  /// target-independent hints provided to MRI::setRegAllocationHint with
-  /// HintType == 0. Targets that override this function should defer to the
-  /// default implementation if they have no reason to change the allocation
-  /// order for VirtReg. There may be target-independent hints.
+  /// The default implementation of this function will only add target
+  /// independent register allocation hints. Targets that override this
+  /// function should typically call this default implementation as well and
+  /// expect to see generic copy hints added.
   virtual bool getRegAllocationHints(unsigned VirtReg,
                                      ArrayRef<MCPhysReg> Order,
                                      SmallVectorImpl<MCPhysReg> &Hints,
@@ -808,6 +807,13 @@ class TargetRegisterInfo : public MCRegisterInfo {
     // Do nothing.
   }
 
+  /// The creation of multiple copy hints has been implemented in
+  /// weightCalcHelper(), but since this affects so many tests for many
+  /// targets, it is temporarily disabled by default. THIS SHOULD BE
+  /// "GENERAL GOODNESS" and hopefully all targets will update their tests
+  /// and enable this soon. This hook should then be removed.
+  virtual bool enableMultipleCopyHints() const { return false; }
+
   /// Allow the target to reverse allocation order of local live ranges. This
   /// will generally allocate shorter local live ranges first. For targets with
   /// many registers, this could reduce regalloc compile time by a large
diff --git a/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h
index 2618fc6ba753c..9030918ebbb32 100644
--- a/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h
+++ b/include/llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h
@@ -16,6 +16,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h"
 #include "llvm/DebugInfo/CodeView/TypeCollection.h"
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/Support/Allocator.h"
 #include <cassert>
@@ -27,13 +28,6 @@ namespace llvm {
 namespace codeview {
 
 class ContinuationRecordBuilder;
-class TypeHasher;
-
-struct HashedType {
-  hash_code Hash;
-  ArrayRef<uint8_t> Data;
-  TypeIndex Index;
-};
 
 class MergingTypeTableBuilder : public TypeCollection {
   /// Storage for records. These need to outlive the TypeTableBuilder.
@@ -45,14 +39,11 @@ class MergingTypeTableBuilder : public TypeCollection {
   SimpleTypeSerializer SimpleSerializer;
 
   /// Hash table.
-  DenseSet<HashedType> HashedRecords;
+  DenseMap<LocallyHashedType, TypeIndex> HashedRecords;
 
   /// Contains a list of all records indexed by TypeIndex.toArrayIndex().
   SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;
 
-  /// Contains a list of all hash codes indexed by TypeIndex.toArrayIndex().
-  SmallVector<hash_code, 2> SeenHashes;
-
 public:
   explicit MergingTypeTableBuilder(BumpPtrAllocator &Storage);
   ~MergingTypeTableBuilder();
@@ -73,7 +64,6 @@ class MergingTypeTableBuilder : public TypeCollection {
   BumpPtrAllocator &getAllocator() { return RecordStorage; }
 
   ArrayRef<ArrayRef<uint8_t>> records() const;
-  ArrayRef<hash_code> hashes() const;
 
   TypeIndex insertRecordAs(hash_code Hash, ArrayRef<uint8_t> &Record);
   TypeIndex insertRecordBytes(ArrayRef<uint8_t> &Record);
diff --git a/include/llvm/DebugInfo/CodeView/TypeHashing.h b/include/llvm/DebugInfo/CodeView/TypeHashing.h
new file mode 100644
index 0000000000000..8f7d2abadcaed
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/TypeHashing.h
@@ -0,0 +1,119 @@
+//===- TypeHashing.h --------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H
+#define LLVM_DEBUGINFO_CODEVIEW_TYPEHASHING_H
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+
+namespace llvm {
+namespace codeview {
+
+/// A locally hashed type represents a straightforward hash code of a
+/// serialized record. The record is simply serialized, and then the bytes are
+/// hashed by a standard algorithm. This is sufficient for the case of
+/// de-duplicating records within a single sequence of types, because if two
+/// records both have a back-reference to the same type in the same stream,
+/// they will both have the same numeric value for the TypeIndex of the back
+/// reference.
+struct LocallyHashedType {
+  hash_code Hash;
+  ArrayRef<uint8_t> RecordData;
+
+  static LocallyHashedType hashType(ArrayRef<uint8_t> RecordData);
+};
+
+/// A globally hashed type represents a hash value that is sufficient to
+/// uniquely identify a record across multiple type streams or type sequences.
+/// This works by, for any given record A which references B, replacing the
+/// TypeIndex that refers to B with a previously-computed global hash for B.
+/// As this is a recursive algorithm (e.g. the global hash of B also depends
+/// on the global hashes of the types that B refers to), a global hash can
+/// uniquely identify that A occurs in another stream that has a completely
+/// different graph structure. Although the hash itself is slower to compute,
+/// probing is much faster with a globally hashed type, because the hash
+/// itself is considered "as good as" the original type. Since type records
+/// can be quite large, this makes the equality comparison of the hash much
+/// faster than equality comparison of a full record.
+struct GloballyHashedType { + GloballyHashedType() = default; + GloballyHashedType(StringRef H) + : GloballyHashedType(ArrayRef(H.bytes_begin(), H.bytes_end())) {} + GloballyHashedType(ArrayRef H) { + assert(H.size() == 20); + ::memcpy(Hash.data(), H.data(), 20); + } + std::array Hash; + + /// Given a sequence of bytes representing a record, compute a global hash for + /// this record. Due to the nature of global hashes incorporating the hashes + /// of referenced records, this function requires a list of types and ids + /// that RecordData might reference, indexable by TypeIndex. + static GloballyHashedType hashType(ArrayRef RecordData, + ArrayRef PreviousTypes, + ArrayRef PreviousIds); + + /// Given a sequence of combined type and ID records, compute global hashes + /// for each of them, returning the results in a vector of hashed types. + template + static std::vector hashTypes(Range &&Records) { + std::vector Hashes; + Hashes.reserve(std::distance(std::begin(Records), std::end(Records))); + for (const auto &R : Records) + Hashes.push_back(hashType(R, Hashes, Hashes)); + + return Hashes; + } +}; +} // namespace codeview + +template <> struct DenseMapInfo { + static codeview::LocallyHashedType Empty; + static codeview::LocallyHashedType Tombstone; + + static codeview::LocallyHashedType getEmptyKey() { return Empty; } + + static codeview::LocallyHashedType getTombstoneKey() { return Tombstone; } + + static unsigned getHashValue(codeview::LocallyHashedType Val) { + return Val.Hash; + } + + static bool isEqual(codeview::LocallyHashedType LHS, + codeview::LocallyHashedType RHS) { + if (LHS.Hash != RHS.Hash) + return false; + return LHS.RecordData == RHS.RecordData; + } +}; + +template <> struct DenseMapInfo { + static codeview::GloballyHashedType Empty; + static codeview::GloballyHashedType Tombstone; + + static codeview::GloballyHashedType getEmptyKey() { return Empty; } + + static codeview::GloballyHashedType getTombstoneKey() { return Tombstone; } + + static unsigned getHashValue(codeview::GloballyHashedType Val) { + return *reinterpret_cast(Val.Hash.data()); + } + + static bool isEqual(codeview::GloballyHashedType LHS, + codeview::GloballyHashedType RHS) { + return LHS.Hash == RHS.Hash; + } +}; + +} // namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index a780a49bbbf82..508bdd395f74e 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -334,6 +334,11 @@ class PointerRecord : public TypeRecord { uint32_t Attrs; Optional MemberInfo; + void setAttrs(PointerKind PK, PointerMode PM, PointerOptions PO, + uint8_t Size) { + Attrs = calcAttrs(PK, PM, PO, Size); + } + private: static uint32_t calcAttrs(PointerKind PK, PointerMode PM, PointerOptions PO, uint8_t Size) { diff --git a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h new file mode 100644 index 0000000000000..587bfa993f3a8 --- /dev/null +++ b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h @@ -0,0 +1,133 @@ +//===- SymbolStringPool.h - Multi-threaded pool for JIT symbols -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Contains a multi-threaded string pool suitable for use with ORC. 
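The DenseMapInfo specializations in the TypeHashing.h additions above are what let MergingTypeTableBuilder key its hash table directly on hashed types: probing compares the cheap hash first and falls back to the record bytes only on collision. A sketch of the intended lookup pattern; internRecord and NextIndex are hypothetical names:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/DebugInfo/CodeView/TypeHashing.h"
using namespace llvm;
using namespace llvm::codeview;

TypeIndex internRecord(DenseMap<LocallyHashedType, TypeIndex> &Records,
                       ArrayRef<uint8_t> RecordData, TypeIndex NextIndex) {
  // Equal hashes fall back to comparing record bytes (see isEqual above),
  // so colliding-but-different records are still kept distinct.
  LocallyHashedType Key = LocallyHashedType::hashType(RecordData);
  auto Inserted = Records.try_emplace(Key, NextIndex);
  return Inserted.first->second; // existing index if the record was seen
}
```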
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H +#define LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H + +#include "llvm/ADT/StringMap.h" +#include +#include + +namespace llvm { +namespace orc { + +class SymbolStringPtr; + +/// @brief String pool for symbol names used by the JIT. +class SymbolStringPool { + friend class SymbolStringPtr; +public: + /// @brief Create a symbol string pointer from the given string. + SymbolStringPtr intern(StringRef S); + + /// @brief Remove from the pool any entries that are no longer referenced. + void clearDeadEntries(); + + /// @brief Returns true if the pool is empty. + bool empty() const; +private: + using RefCountType = std::atomic; + using PoolMap = StringMap; + using PoolMapEntry = StringMapEntry; + mutable std::mutex PoolMutex; + PoolMap Pool; +}; + +/// @brief Pointer to a pooled string representing a symbol name. +class SymbolStringPtr { + friend class SymbolStringPool; +public: + SymbolStringPtr() = default; + SymbolStringPtr(const SymbolStringPtr &Other) + : S(Other.S) { + if (S) + ++S->getValue(); + } + + SymbolStringPtr& operator=(const SymbolStringPtr &Other) { + if (S) + --S->getValue(); + S = Other.S; + if (S) + ++S->getValue(); + return *this; + } + + SymbolStringPtr(SymbolStringPtr &&Other) : S(nullptr) { + std::swap(S, Other.S); + } + + SymbolStringPtr& operator=(SymbolStringPtr &&Other) { + if (S) + --S->getValue(); + S = nullptr; + std::swap(S, Other.S); + return *this; + } + + ~SymbolStringPtr() { + if (S) + --S->getValue(); + } + + bool operator==(const SymbolStringPtr &Other) const { + return S == Other.S; + } + + bool operator!=(const SymbolStringPtr &Other) const { + return !(*this == Other); + } + +private: + + SymbolStringPtr(SymbolStringPool::PoolMapEntry *S) + : S(S) { + if (S) + ++S->getValue(); + } + + SymbolStringPool::PoolMapEntry *S = nullptr; +}; + +SymbolStringPtr SymbolStringPool::intern(StringRef S) { + std::lock_guard Lock(PoolMutex); + auto I = Pool.find(S); + if (I != Pool.end()) + return SymbolStringPtr(&*I); + + bool Added; + std::tie(I, Added) = Pool.try_emplace(S, 0); + assert(Added && "Insert should always succeed here"); + return SymbolStringPtr(&*I); +} + +void SymbolStringPool::clearDeadEntries() { + std::lock_guard Lock(PoolMutex); + for (auto I = Pool.begin(), E = Pool.end(); I != E;) { + auto Tmp = std::next(I); + if (I->second == 0) + Pool.erase(I); + I = Tmp; + } +} + +bool SymbolStringPool::empty() const { + std::lock_guard Lock(PoolMutex); + return Pool.empty(); +} + +} // end namespace orc + +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h index ff6495e7f0757..029bfe578af24 100644 --- a/include/llvm/IR/ConstantRange.h +++ b/include/llvm/IR/ConstantRange.h @@ -96,9 +96,9 @@ class LLVM_NODISCARD ConstantRange { /// /// NB! The returned set does *not* contain **all** possible values of X for /// which "X BinOpC Y" does not wrap -- some viable values of X may be - /// missing, so you cannot use this to constrain X's range. E.g. in the last - /// example, "(-2) + 1" is both nsw and nuw (so the "X" could be -2), but (-2) - /// is not in the set returned. + /// missing, so you cannot use this to constrain X's range. E.g. in the + /// fourth example, "(-2) + 1" is both nsw and nuw (so the "X" could be -2), + /// but (-2) is not in the set returned. 
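A minimal usage sketch for the pool and its ref-counted handle as defined in the SymbolStringPool hunks above; the function name is illustrative:

```cpp
#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
#include <cassert>
using namespace llvm::orc;

void poolDemo() {
  SymbolStringPool Pool;
  {
    SymbolStringPtr Foo = Pool.intern("foo");
    SymbolStringPtr Foo2 = Foo; // copy shares the ref-counted entry
    assert(Foo == Foo2);
    assert(!Pool.empty());
  } // both handles are gone; the entry's count drops back to zero
  Pool.clearDeadEntries(); // removes the now-dead "foo" entry
  assert(Pool.empty());
}
```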
/// /// Examples: /// typedef OverflowingBinaryOperator OBO; @@ -109,6 +109,10 @@ class LLVM_NODISCARD ConstantRange { /// MGNR(Add, [i8 1, 2), OBO::NoUnsignedWrap | OBO::NoSignedWrap) /// == [0,INT_MAX) /// MGNR(Add, [i8 -1, 6), OBO::NoSignedWrap) == [INT_MIN+1, INT_MAX-4) + /// MGNR(Sub, [i8 1, 2), OBO::NoSignedWrap) == [-127, 128) + /// MGNR(Sub, [i8 1, 2), OBO::NoUnsignedWrap) == [1, 0) + /// MGNR(Sub, [i8 1, 2), OBO::NoUnsignedWrap | OBO::NoSignedWrap) + /// == [1,INT_MAX) static ConstantRange makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp, const ConstantRange &Other, unsigned NoWrapKind); diff --git a/include/llvm/IR/IntrinsicsSystemZ.td b/include/llvm/IR/IntrinsicsSystemZ.td index 98065bc51d992..8c078a629e12b 100644 --- a/include/llvm/IR/IntrinsicsSystemZ.td +++ b/include/llvm/IR/IntrinsicsSystemZ.td @@ -198,14 +198,14 @@ multiclass SystemZQuaternaryIntCCBHF { let TargetPrefix = "s390" in { def int_s390_tbegin : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoDuplicate]>; + [IntrNoDuplicate, IntrWriteMem]>; def int_s390_tbegin_nofloat : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoDuplicate]>; + [IntrNoDuplicate, IntrWriteMem]>; def int_s390_tbeginc : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoDuplicate]>; + [IntrNoDuplicate, IntrWriteMem]>; def int_s390_tabort : Intrinsic<[], [llvm_i64_ty], [IntrNoReturn, Throws]>; @@ -217,7 +217,7 @@ let TargetPrefix = "s390" in { Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; def int_s390_ntstg : Intrinsic<[], [llvm_i64_ty, llvm_ptr64_ty], - [IntrArgMemOnly]>; + [IntrArgMemOnly, IntrWriteMem]>; def int_s390_ppa_txassist : GCCBuiltin<"__builtin_tx_assist">, Intrinsic<[], [llvm_i32_ty]>; @@ -260,9 +260,7 @@ let TargetPrefix = "s390" in { def int_s390_vstl : GCCBuiltin<"__builtin_s390_vstl">, Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], - // In fact write-only but there's no property - // for that. - [IntrArgMemOnly]>; + [IntrArgMemOnly, IntrWriteMem]>; defm int_s390_vupl : SystemZUnaryExtBHWF<"vupl">; defm int_s390_vupll : SystemZUnaryExtBHF<"vupll">; @@ -413,9 +411,7 @@ let TargetPrefix = "s390" in { def int_s390_vstrl : GCCBuiltin<"__builtin_s390_vstrl">, Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], - // In fact write-only but there's no property - // for that. - [IntrArgMemOnly]>; + [IntrArgMemOnly, IntrWriteMem]>; } //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 459463003c722..c8ede72fb7e1f 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3738,15 +3738,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
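One of the new Sub rows above, checked explicitly. A sketch under the stated header, linking against LLVMCore; the function name is illustrative:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include <cassert>
using namespace llvm;

void noWrapSubExample() {
  ConstantRange One(APInt(8, 1), APInt(8, 2)); // the singleton set {1}
  ConstantRange R = ConstantRange::makeGuaranteedNoWrapRegion(
      Instruction::Sub, One, OverflowingBinaryOperator::NoUnsignedWrap);
  // X - 1 cannot wrap unsigned iff X >= 1: the wrapped range [1, 0),
  // i.e. every i8 value except zero, matching the comment above.
  assert(R == ConstantRange(APInt(8, 1), APInt(8, 0)));
}
```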
def int_x86_avx512_kxnor_w : GCCBuiltin<"__builtin_ia32_kxnorhi">, Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, - Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], - [IntrNoMem]>; - def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; diff --git a/include/llvm/Support/TarWriter.h b/include/llvm/Support/TarWriter.h index 44bdcaf2c4658..639f61b538922 100644 --- a/include/llvm/Support/TarWriter.h +++ b/include/llvm/Support/TarWriter.h @@ -11,6 +11,7 @@ #define LLVM_SUPPORT_TAR_WRITER_H #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" @@ -26,6 +27,7 @@ class TarWriter { TarWriter(int FD, StringRef BaseDir); raw_fd_ostream OS; std::string BaseDir; + StringSet<> Files; }; } diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td index 3b2e8f2d08b87..28c90bf22767d 100644 --- a/include/llvm/Target/GenericOpcodes.td +++ b/include/llvm/Target/GenericOpcodes.td @@ -484,28 +484,6 @@ def G_STORE : GenericInstruction { let mayStore = 1; } -// Generic atomic load. Expects a MachineMemOperand in addition to explicit -// operands. Technically, we could have handled this as a G_LOAD, however we -// decided to keep it separate on the basis that atomic loads tend to have -// very different handling to non-atomic loads. -def G_ATOMIC_LOAD : GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins ptype1:$addr); - let hasSideEffects = 0; - let mayLoad = 1; -} - -// Generic atomic store. Expects a MachineMemOperand in addition to explicit -// operands. Technically, we could have handled this as a G_STORE, however we -// decided to keep it separate on the basis that atomic stores tend to have -// very different handling to non-atomic stores. -def G_ATOMIC_STORE : GenericInstruction { - let OutOperandList = (outs); - let InOperandList = (ins type0:$src, ptype1:$addr); - let hasSideEffects = 0; - let mayStore = 1; -} - // Generic atomic cmpxchg with internal success check. Expects a // MachineMemOperand in addition to explicit operands. def G_ATOMIC_CMPXCHG_WITH_SUCCESS : GenericInstruction { diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 71c8ce6d20c8e..575f228cd7733 100644 --- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -23,6 +23,11 @@ class GINodeEquiv { Instruction I = i; SDNode Node = node; + + // SelectionDAG has separate nodes for atomic and non-atomic memory operations + // (ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE) but GlobalISel + // stores this information in the MachineMemoryOperand. + bit CheckMMOIsNonAtomic = 0; } // These are defined in the same order as the G_* instructions. @@ -75,16 +80,19 @@ def : GINodeEquiv; // Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some // complications that tablegen must take care of. 
For example, Predicates such
 // as isSignExtLoad require that this is not a perfect 1:1 mapping since a
-// sign-extending load is (G_SEXT (G_LOAD x)) in GlobalISel.
-def : GINodeEquiv<G_LOAD, ld>;
+// sign-extending load is (G_SEXT (G_LOAD x)) in GlobalISel. Additionally,
+// G_LOAD handles both atomic and non-atomic loads whereas SelectionDAG had
+// separate nodes for them. This GINodeEquiv maps the non-atomic loads to
+// G_LOAD with a non-atomic MachineMemOperand.
+def : GINodeEquiv<G_LOAD, ld> { let CheckMMOIsNonAtomic = 1; }
 
 // Broadly speaking G_STORE is equivalent to ISD::STORE but there are some
 // complications that tablegen must take care of. For example, predicates such
 // as isTruncStore require that this is not a perfect 1:1 mapping since a
-// truncating store is (G_STORE (G_TRUNCATE x)) in GlobalISel.
-def : GINodeEquiv<G_STORE, st>;
-
-def : GINodeEquiv<G_ATOMIC_LOAD, atomic_load>;
-def : GINodeEquiv<G_ATOMIC_STORE, atomic_store>;
+// truncating store is (G_STORE (G_TRUNCATE x)) in GlobalISel. Additionally,
+// G_STORE handles both atomic and non-atomic stores whereas SelectionDAG had
+// separate nodes for them. This GINodeEquiv maps the non-atomic stores to
+// G_STORE with a non-atomic MachineMemOperand.
+def : GINodeEquiv<G_STORE, st> { let CheckMMOIsNonAtomic = 1; }
 def : GINodeEquiv<G_ATOMIC_CMPXCHG_WITH_SUCCESS, atomic_cmp_swap_with_success>;
 def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap>;
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 897f89d311489..97ea0fc40bd3c 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -122,10 +122,10 @@ ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) {
   ModRefInfo Result = MRI_ModRef;
 
   for (const auto &AA : AAs) {
-    Result = ModRefInfo(Result & AA->getArgModRefInfo(CS, ArgIdx));
+    Result = intersectModRef(Result, AA->getArgModRefInfo(CS, ArgIdx));
 
     // Early-exit the moment we reach the bottom of the lattice.
-    if (Result == MRI_NoModRef)
+    if (isNoModRef(Result))
       return Result;
   }
 
@@ -146,8 +146,9 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) {
     // is that if the call references what this instruction
     // defines, it must be clobbered by this location.
     const MemoryLocation DefLoc = MemoryLocation::get(I);
-    if (getModRefInfo(Call, DefLoc) != MRI_NoModRef)
-      return MRI_ModRef;
+    ModRefInfo MR = getModRefInfo(Call, DefLoc);
+    if (isModOrRefSet(MR))
+      return setModAndRef(MR);
   }
   return MRI_NoModRef;
 }
@@ -157,10 +158,10 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
   ModRefInfo Result = MRI_ModRef;
 
   for (const auto &AA : AAs) {
-    Result = ModRefInfo(Result & AA->getModRefInfo(CS, Loc));
+    Result = intersectModRef(Result, AA->getModRefInfo(CS, Loc));
 
     // Early-exit the moment we reach the bottom of the lattice.
-    if (Result == MRI_NoModRef)
+    if (isNoModRef(Result))
       return Result;
   }
 
@@ -172,9 +173,9 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
     return MRI_NoModRef;
 
   if (onlyReadsMemory(MRB))
-    Result = ModRefInfo(Result & MRI_Ref);
+    Result = clearMod(Result);
   else if (doesNotReadMemory(MRB))
-    Result = ModRefInfo(Result & MRI_Mod);
+    Result = clearRef(Result);
 
   if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) {
     bool DoesAlias = false;
@@ -190,20 +191,21 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
         if (ArgAlias != NoAlias) {
           ModRefInfo ArgMask = getArgModRefInfo(CS, ArgIdx);
           DoesAlias = true;
-          AllArgsMask = ModRefInfo(AllArgsMask | ArgMask);
+          AllArgsMask = unionModRef(AllArgsMask, ArgMask);
         }
       }
     }
+    // Return MRI_NoModRef if no alias found with any argument.
if (!DoesAlias) return MRI_NoModRef; - Result = ModRefInfo(Result & AllArgsMask); + // Logical & between other AA analyses and argument analysis. + Result = intersectModRef(Result, AllArgsMask); } // If Loc is a constant memory location, the call definitely could not // modify the memory location. - if ((Result & MRI_Mod) && - pointsToConstantMemory(Loc, /*OrLocal*/ false)) - Result = ModRefInfo(Result & ~MRI_Mod); + if (isModSet(Result) && pointsToConstantMemory(Loc, /*OrLocal*/ false)) + Result = clearMod(Result); return Result; } @@ -213,10 +215,10 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, ModRefInfo Result = MRI_ModRef; for (const auto &AA : AAs) { - Result = ModRefInfo(Result & AA->getModRefInfo(CS1, CS2)); + Result = intersectModRef(Result, AA->getModRefInfo(CS1, CS2)); // Early-exit the moment we reach the bottom of the lattice. - if (Result == MRI_NoModRef) + if (isNoModRef(Result)) return Result; } @@ -239,9 +241,9 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, // If CS1 only reads memory, the only dependence on CS2 can be // from CS1 reading memory written by CS2. if (onlyReadsMemory(CS1B)) - Result = ModRefInfo(Result & MRI_Ref); + Result = clearMod(Result); else if (doesNotReadMemory(CS1B)) - Result = ModRefInfo(Result & MRI_Mod); + Result = clearRef(Result); // If CS2 only access memory through arguments, accumulate the mod/ref // information from CS1's references to the memory referenced by @@ -256,17 +258,23 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I); auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, TLI); - // ArgMask indicates what CS2 might do to CS2ArgLoc, and the dependence - // of CS1 on that location is the inverse. - ModRefInfo ArgMask = getArgModRefInfo(CS2, CS2ArgIdx); - if (ArgMask == MRI_Mod) + // ArgModRefCS2 indicates what CS2 might do to CS2ArgLoc, and the + // dependence of CS1 on that location is the inverse: + // - If CS2 modifies location, dependence exists if CS1 reads or writes. + // - If CS2 only reads location, dependence exists if CS1 writes. + ModRefInfo ArgModRefCS2 = getArgModRefInfo(CS2, CS2ArgIdx); + ModRefInfo ArgMask = MRI_NoModRef; + if (isModSet(ArgModRefCS2)) ArgMask = MRI_ModRef; - else if (ArgMask == MRI_Ref) + else if (isRefSet(ArgModRefCS2)) ArgMask = MRI_Mod; - ArgMask = ModRefInfo(ArgMask & getModRefInfo(CS1, CS2ArgLoc)); + // ModRefCS1 indicates what CS1 might do to CS2ArgLoc, and we use + // above ArgMask to update dependence info. + ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc); + ArgMask = intersectModRef(ArgMask, ModRefCS1); - R = ModRefInfo((R | ArgMask) & Result); + R = intersectModRef(unionModRef(R, ArgMask), Result); if (R == Result) break; } @@ -286,16 +294,14 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I); auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, TLI); - // ArgMask indicates what CS1 might do to CS1ArgLoc; if CS1 might Mod - // CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If CS1 - // might Ref, then we care only about a Mod by CS2. 
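The inverse-dependence rule used by both argument walks in this function, written out as a standalone helper. This is a sketch, not code from the patch; the name is illustrative:

```cpp
#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

ModRefInfo dependenceOnLocation(ModRefInfo OtherCallEffect) {
  if (isModSet(OtherCallEffect))
    return MRI_ModRef; // other call may write: reads and writes both depend
  if (isRefSet(OtherCallEffect))
    return MRI_Mod;    // other call only reads: only a write can interfere
  return MRI_NoModRef; // no effect on the location: no dependence
}
```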
- ModRefInfo ArgMask = getArgModRefInfo(CS1, CS1ArgIdx); - ModRefInfo ArgR = getModRefInfo(CS2, CS1ArgLoc); - if (((ArgMask & MRI_Mod) != MRI_NoModRef && - (ArgR & MRI_ModRef) != MRI_NoModRef) || - ((ArgMask & MRI_Ref) != MRI_NoModRef && - (ArgR & MRI_Mod) != MRI_NoModRef)) - R = ModRefInfo((R | ArgMask) & Result); + // ArgModRefCS1 indicates what CS1 might do to CS1ArgLoc; if CS1 might + // Mod CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If + // CS1 might Ref, then we care only about a Mod by CS2. + ModRefInfo ArgModRefCS1 = getArgModRefInfo(CS1, CS1ArgIdx); + ModRefInfo ModRefCS2 = getModRefInfo(CS2, CS1ArgLoc); + if ((isModSet(ArgModRefCS1) && isModOrRefSet(ModRefCS2)) || + (isRefSet(ArgModRefCS1) && isModSet(ModRefCS2))) + R = intersectModRef(unionModRef(R, ArgModRefCS1), Result); if (R == Result) break; @@ -456,7 +462,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, /// \brief Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I -/// in a BasicBlock. A ordered basic block \p OBB can be used to speed up +/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up /// instruction-ordering queries inside the BasicBlock containing \p I. /// FIXME: this is really just shoring-up a deficiency in alias analysis. /// BasicAA isn't willing to spend linear time determining whether an alloca @@ -538,7 +544,7 @@ bool AAResults::canInstructionRangeModRef(const Instruction &I1, ++E; // Convert from inclusive to exclusive range. for (; I != E; ++I) // Check every instruction in range - if (getModRefInfo(&*I, Loc) & Mode) + if (intersectModRef(getModRefInfo(&*I, Loc), Mode)) return true; return false; } diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index b575944092a9f..c88e0dd7dc447 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -211,8 +211,8 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size, if (!UnknownInsts.empty()) { for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) if (auto *Inst = getUnknownInst(i)) - if (AA.getModRefInfo(Inst, MemoryLocation(Ptr, Size, AAInfo)) != - MRI_NoModRef) + if (isModOrRefSet( + AA.getModRefInfo(Inst, MemoryLocation(Ptr, Size, AAInfo)))) return true; } @@ -231,15 +231,15 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) { if (auto *UnknownInst = getUnknownInst(i)) { ImmutableCallSite C1(UnknownInst), C2(Inst); - if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef || - AA.getModRefInfo(C2, C1) != MRI_NoModRef) + if (!C1 || !C2 || isModOrRefSet(AA.getModRefInfo(C1, C2)) || + isModOrRefSet(AA.getModRefInfo(C2, C1))) return true; } } for (iterator I = begin(), E = end(); I != E; ++I) - if (AA.getModRefInfo(Inst, MemoryLocation(I.getPointer(), I.getSize(), - I.getAAInfo())) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo( + Inst, MemoryLocation(I.getPointer(), I.getSize(), I.getAAInfo())))) return true; return false; @@ -572,12 +572,11 @@ AliasSet &AliasSetTracker::mergeAllAliasSets() { AliasAnyAS->AliasAny = true; for (auto Cur : ASVector) { - // If Cur was already forwarding, just forward to the new AS instead. 
AliasSet *FwdTo = Cur->Forward; if (FwdTo) { Cur->Forward = AliasAnyAS; - AliasAnyAS->addRef(); + AliasAnyAS->addRef(); FwdTo->dropRef(*this); continue; } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index fb9ece2bd2062..de0b0232773ca 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -809,12 +809,12 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, // Operand aliases 'Object', but call doesn't modify it. Strengthen // initial assumption and keep looking in case if there are more aliases. if (CS.onlyReadsMemory(OperandNo)) { - Result = static_cast(Result | MRI_Ref); + Result = setRef(Result); continue; } // Operand aliases 'Object' but call only writes into it. if (CS.doesNotReadMemory(OperandNo)) { - Result = static_cast(Result | MRI_Mod); + Result = setMod(Result); continue; } // This operand aliases 'Object' and call reads and writes into it. @@ -832,7 +832,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, // routines do not read values visible in the IR. TODO: Consider special // casing realloc and strdup routines which access only their arguments as // well. Or alternatively, replace all of this with inaccessiblememonly once - // that's implemented fully. + // that's implemented fully. auto *Inst = CS.getInstruction(); if (isMallocOrCallocLikeFn(Inst, &TLI)) { // Be conservative if the accessed pointer may alias the allocation - @@ -860,9 +860,9 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, // It's also possible for Loc to alias both src and dest, or neither. ModRefInfo rv = MRI_NoModRef; if (SrcAA != NoAlias) - rv = static_cast(rv | MRI_Ref); + rv = setRef(rv); if (DestAA != NoAlias) - rv = static_cast(rv | MRI_Mod); + rv = setMod(rv); return rv; } @@ -933,10 +933,12 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1, // possibilities for guard intrinsics. if (isIntrinsicCall(CS1, Intrinsic::experimental_guard)) - return getModRefBehavior(CS2) & MRI_Mod ? MRI_Ref : MRI_NoModRef; + return isModSet(ModRefInfo(getModRefBehavior(CS2))) ? MRI_Ref + : MRI_NoModRef; if (isIntrinsicCall(CS2, Intrinsic::experimental_guard)) - return getModRefBehavior(CS1) & MRI_Mod ? MRI_Mod : MRI_NoModRef; + return isModSet(ModRefInfo(getModRefBehavior(CS1))) ? MRI_Mod + : MRI_NoModRef; // The AAResultBase base class has some smarts, lets use them. return AAResultBase::getModRefInfo(CS1, CS2); diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp index 4ef023379bb67..732be0da2598d 100644 --- a/lib/Analysis/GlobalsModRef.cpp +++ b/lib/Analysis/GlobalsModRef.cpp @@ -84,6 +84,7 @@ class GlobalsAAResult::FunctionInfo { /// The bit that flags that this function may read any global. This is /// chosen to mix together with ModRefInfo bits. + /// FIXME: This assumes ModRefInfo lattice will remain 4 bits! enum { MayReadAnyGlobal = 4 }; /// Checks to document the invariants of the bit packing here. 
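The packing invariant behind that FIXME can be stated directly. A sketch; the value 4 mirrors the MayReadAnyGlobal enumerator above:

```cpp
#include "llvm/Analysis/AliasAnalysis.h"

// MayReadAnyGlobal (value 4) must sit above the two mod/ref bits, or the
// FunctionInfo bit packing in GlobalsModRef silently corrupts its lattice.
static_assert((4 & llvm::MRI_ModRef) == 0,
              "MayReadAnyGlobal must not overlap the mod/ref bits");
```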
@@ -230,9 +231,9 @@ FunctionModRefBehavior GlobalsAAResult::getModRefBehavior(const Function *F) { FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; if (FunctionInfo *FI = getFunctionInfo(F)) { - if (FI->getModRefInfo() == MRI_NoModRef) + if (!isModOrRefSet(FI->getModRefInfo())) Min = FMRB_DoesNotAccessMemory; - else if ((FI->getModRefInfo() & MRI_Mod) == 0) + else if (!isModSet(FI->getModRefInfo())) Min = FMRB_OnlyReadsMemory; } @@ -246,9 +247,9 @@ GlobalsAAResult::getModRefBehavior(ImmutableCallSite CS) { if (!CS.hasOperandBundles()) if (const Function *F = CS.getCalledFunction()) if (FunctionInfo *FI = getFunctionInfo(F)) { - if (FI->getModRefInfo() == MRI_NoModRef) + if (!isModOrRefSet(FI->getModRefInfo())) Min = FMRB_DoesNotAccessMemory; - else if ((FI->getModRefInfo() & MRI_Mod) == 0) + else if (!isModSet(FI->getModRefInfo())) Min = FMRB_OnlyReadsMemory; } @@ -544,7 +545,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { // Scan the function bodies for explicit loads or stores. for (auto *Node : SCC) { - if (FI.getModRefInfo() == MRI_ModRef) + if (isModAndRefSet(FI.getModRefInfo())) break; // The mod/ref lattice saturates here. // Don't prove any properties based on the implementation of an optnone @@ -554,7 +555,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { continue; for (Instruction &I : instructions(Node->getFunction())) { - if (FI.getModRefInfo() == MRI_ModRef) + if (isModAndRefSet(FI.getModRefInfo())) break; // The mod/ref lattice saturates here. // We handle calls specially because the graph-relevant aspects are @@ -584,9 +585,9 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) { } } - if ((FI.getModRefInfo() & MRI_Mod) == 0) + if (!isModSet(FI.getModRefInfo())) ++NumReadMemFunctions; - if (FI.getModRefInfo() == MRI_NoModRef) + if (!isModOrRefSet(FI.getModRefInfo())) ++NumNoMemFunctions; // Finally, now that we know the full effect on this SCC, clone the @@ -894,7 +895,7 @@ ModRefInfo GlobalsAAResult::getModRefInfoForArgument(ImmutableCallSite CS, ModRefInfo GlobalsAAResult::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) { - unsigned Known = MRI_ModRef; + ModRefInfo Known = MRI_ModRef; // If we are asking for mod/ref info of a direct call with a pointer to a // global we are tracking, return information if we have it. @@ -904,12 +905,12 @@ ModRefInfo GlobalsAAResult::getModRefInfo(ImmutableCallSite CS, if (const Function *F = CS.getCalledFunction()) if (NonAddressTakenGlobals.count(GV)) if (const FunctionInfo *FI = getFunctionInfo(F)) - Known = FI->getModRefInfoForGlobal(*GV) | - getModRefInfoForArgument(CS, GV); + Known = unionModRef(FI->getModRefInfoForGlobal(*GV), + getModRefInfoForArgument(CS, GV)); - if (Known == MRI_NoModRef) + if (!isModOrRefSet(Known)) return MRI_NoModRef; // No need to query other mod/ref analyses - return ModRefInfo(Known & AAResultBase::getModRefInfo(CS, Loc)); + return intersectModRef(Known, AAResultBase::getModRefInfo(CS, Loc)); } GlobalsAAResult::GlobalsAAResult(const DataLayout &DL, diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 78b673be8a0d0..834727c9224d2 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -414,7 +414,7 @@ Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy, // If we have alias analysis and it says the store won't modify the loaded // value, ignore the store. 
- if (AA && (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & MRI_Mod) == 0) + if (AA && !isModSet(AA->getModRefInfo(SI, StrippedPtr, AccessSize))) continue; // Otherwise the store that may or may not alias the pointer, bail out. @@ -426,8 +426,7 @@ Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy, if (Inst->mayWriteToMemory()) { // If alias analysis claims that it really won't modify the load, // ignore it. - if (AA && - (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & MRI_Mod) == 0) + if (AA && !isModSet(AA->getModRefInfo(Inst, StrippedPtr, AccessSize))) continue; // May modify the pointer, bail out. diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index d41b6be75f21b..c54f676eb94ae 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -212,32 +212,30 @@ MemDepResult MemoryDependenceResults::getCallSiteDependencyFrom( ModRefInfo MR = GetLocation(Inst, Loc, TLI); if (Loc.Ptr) { // A simple instruction. - if (AA.getModRefInfo(CS, Loc) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(CS, Loc))) return MemDepResult::getClobber(Inst); continue; } if (auto InstCS = CallSite(Inst)) { // If these two calls do not interfere, look past it. - switch (AA.getModRefInfo(CS, InstCS)) { - case MRI_NoModRef: + if (isNoModRef(AA.getModRefInfo(CS, InstCS))) { // If the two calls are the same, return InstCS as a Def, so that // CS can be found redundant and eliminated. - if (isReadOnlyCall && !(MR & MRI_Mod) && + if (isReadOnlyCall && !isModSet(MR) && CS.getInstruction()->isIdenticalToWhenDefined(Inst)) return MemDepResult::getDef(Inst); // Otherwise if the two calls don't interact (e.g. InstCS is readnone) // keep scanning. continue; - default: + } else return MemDepResult::getClobber(Inst); - } } // If we could not obtain a pointer for the instruction and the instruction // touches memory then assume that this is a dependency. - if (MR != MRI_NoModRef) + if (isModOrRefSet(MR)) return MemDepResult::getClobber(Inst); } @@ -642,7 +640,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // If alias analysis can tell that this store is guaranteed to not modify // the query pointer, ignore it. Use getModRefInfo to handle cases where // the query pointer points to constant memory etc. - if (AA.getModRefInfo(SI, MemLoc) == MRI_NoModRef) + if (!isModOrRefSet(AA.getModRefInfo(SI, MemLoc))) continue; // Ok, this store might clobber the query pointer. Check to see if it is @@ -688,7 +686,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc); // If necessary, perform additional analysis. 
- if (MR == MRI_ModRef)
+ if (isModAndRefSet(MR))
MR = AA.callCapturesBefore(Inst, MemLoc, &DT, &OBB);
switch (MR) {
case MRI_NoModRef:
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 4af18ccb2af80..8fe190e8bcf80 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -262,7 +262,7 @@ static bool instructionClobbersQuery(MemoryDef *MD,
if (UseCS) {
ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
- return I != MRI_NoModRef;
+ return isModOrRefSet(I);
}
if (auto *DefLoad = dyn_cast<LoadInst>(DefInst)) {
@@ -278,7 +278,7 @@
}
}
- return AA.getModRefInfo(DefInst, UseLoc) & MRI_Mod;
+ return isModSet(AA.getModRefInfo(DefInst, UseLoc));
}
static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
@@ -1526,8 +1526,8 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// Separate memory aliasing and ordering into two different chains so that we
// can precisely represent both "what memory will this read/write/is clobbered
// by" and "what instructions can I move this past".
- bool Def = bool(ModRef & MRI_Mod) || isOrdered(I);
- bool Use = bool(ModRef & MRI_Ref);
+ bool Def = isModSet(ModRef) || isOrdered(I);
+ bool Use = isRefSet(ModRef);
// It's possible for an instruction to not modify memory at all. During
// construction, we ignore them.
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 106a4a71f93a5..8270f9c10725f 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -548,7 +548,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
m_BitCast(m_Specific(V))));
CmpInst::Predicate Pred;
- ConstantInt *C;
+ uint64_t C;
// assume(v = a)
if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
@@ -650,51 +650,55 @@
} else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
- isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ isValidAssumeForContext(I, Q.CxtI, Q.DT) &&
+ C < BitWidth) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
- RHSKnown.Zero.lshrInPlace(C->getZExtValue());
+ RHSKnown.Zero.lshrInPlace(C);
Known.Zero |= RHSKnown.Zero;
- RHSKnown.One.lshrInPlace(C->getZExtValue());
+ RHSKnown.One.lshrInPlace(C);
Known.One |= RHSKnown.One;
// assume(~(v << c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
- isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ isValidAssumeForContext(I, Q.CxtI, Q.DT) &&
+ C < BitWidth) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
- RHSKnown.One.lshrInPlace(C->getZExtValue());
+ RHSKnown.One.lshrInPlace(C);
Known.Zero |= RHSKnown.One;
- RHSKnown.Zero.lshrInPlace(C->getZExtValue());
+ RHSKnown.Zero.lshrInPlace(C);
Known.One |= RHSKnown.Zero;
// assume(v >> c = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
- isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ isValidAssumeForContext(I, Q.CxtI, Q.DT) &&
+ C < BitWidth) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
- Known.Zero |= RHSKnown.Zero << C->getZExtValue();
- Known.One |= RHSKnown.One << C->getZExtValue();
+ Known.Zero |= RHSKnown.Zero << C;
+ Known.One |= RHSKnown.One << C;
// assume(~(v >> c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
- isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ isValidAssumeForContext(I, Q.CxtI, Q.DT) &&
+ C < BitWidth) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
- Known.Zero |= RHSKnown.One << C->getZExtValue();
- Known.One |= RHSKnown.Zero << C->getZExtValue();
+ Known.Zero |= RHSKnown.One << C;
+ Known.One |= RHSKnown.Zero << C;
// assume(v >=_s c) where c is non-negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SGE &&
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 6a6ec461cf701..f52a068a4cd17 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -70,13 +70,24 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg,
return sub == hsub ? hreg : 0;
const TargetRegisterClass *rc = mri.getRegClass(reg);
+ if (!tri.enableMultipleCopyHints()) {
+ // Only allow physreg hints in rc.
+ if (sub == 0)
+ return rc->contains(hreg) ? hreg : 0;
- // Only allow physreg hints in rc.
- if (sub == 0)
- return rc->contains(hreg) ? hreg : 0;
+ // reg:sub should match the physreg hreg.
+ return tri.getMatchingSuperReg(hreg, sub, rc);
+ }
+
+ unsigned CopiedPReg = (hsub ? tri.getSubReg(hreg, hsub) : hreg);
+ if (rc->contains(CopiedPReg))
+ return CopiedPReg;
+
+ // Check if reg:sub matches so that a super register could be hinted.
+ if (sub)
+ return tri.getMatchingSuperReg(CopiedPReg, sub, rc);
- // reg:sub should match the physreg hreg.
- return tri.getMatchingSuperReg(hreg, sub, rc);
+ return 0;
}
// Check if all values in LI are rematerializable
@@ -157,12 +168,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
unsigned numInstr = 0; // Number of instructions using li
SmallPtrSet<MachineInstr *, 8> visited;
- // Find the best physreg hint and the best virtreg hint.
- float bestPhys = 0, bestVirt = 0;
- unsigned hintPhys = 0, hintVirt = 0;
-
- // Don't recompute a target specific hint.
- bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
+ std::pair<unsigned, unsigned> TargetHint = mri.getRegAllocationHint(li.reg);
// Don't recompute spill weight for an unspillable register.
bool Spillable = li.isSpillable();
@@ -188,6 +194,36 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
numInstr += 2;
}
+ // CopyHint is a sortable hint derived from a COPY instruction.
+ struct CopyHint {
+ unsigned Reg;
+ float Weight;
+ bool IsPhys;
+ unsigned HintOrder;
+ CopyHint(unsigned R, float W, bool P, unsigned HR) :
+ Reg(R), Weight(W), IsPhys(P), HintOrder(HR) {}
+ bool operator<(const CopyHint &rhs) const {
+ // Always prefer any physreg hint.
+ if (IsPhys != rhs.IsPhys)
+ return (IsPhys && !rhs.IsPhys);
+ if (Weight != rhs.Weight)
+ return (Weight > rhs.Weight);
+
+ // This is just a temporary way to achieve NFC for targets that don't
+ // enable multiple copy hints. HintOrder should be removed when all
+ // targets return true in enableMultipleCopyHints().
+ return (HintOrder < rhs.HintOrder);
+
+#if 0 // Should replace the HintOrder check, see above.
+ // (just for the purpose of maintaining the set)
+ return Reg < rhs.Reg;
+#endif
+ }
+ };
+ std::set<CopyHint> CopyHints;
+
+ // Temporary: see comment for HintOrder above.
+ unsigned CopyHintOrder = 0;
for (MachineRegisterInfo::reg_instr_iterator
I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end();
I != E; ) {
@@ -227,7 +263,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
}
// Get allocation hints from copies.
- if (noHint || !mi->isCopy())
+ if (!mi->isCopy() ||
+ (TargetHint.first != 0 && !tri.enableMultipleCopyHints()))
continue;
unsigned hint = copyHint(mi, li.reg, tri, mri);
if (!hint)
@@ -237,28 +274,30 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
//
// FIXME: we probably shouldn't use floats at all.
volatile float hweight = Hint[hint] += weight;
- if (TargetRegisterInfo::isPhysicalRegister(hint)) {
- if (hweight > bestPhys && mri.isAllocatable(hint)) {
- bestPhys = hweight;
- hintPhys = hint;
- }
- } else {
- if (hweight > bestVirt) {
- bestVirt = hweight;
- hintVirt = hint;
- }
- }
+ if (TargetRegisterInfo::isVirtualRegister(hint) || mri.isAllocatable(hint))
+ CopyHints.insert(CopyHint(hint, hweight, tri.isPhysicalRegister(hint),
+ (tri.enableMultipleCopyHints() ? hint : CopyHintOrder++)));
}
Hint.clear();
- // Always prefer the physreg hint.
- if (updateLI) {
- if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
- mri.setRegAllocationHint(li.reg, 0, hint);
- // Weakly boost the spill weight of hinted registers.
- totalWeight *= 1.01F;
+ // Pass all the sorted copy hints to mri.
+ if (updateLI && CopyHints.size()) {
+ // Remove a generic hint if previously added by target.
+ if (TargetHint.first == 0 && TargetHint.second)
+ mri.clearSimpleHint(li.reg);
+
+ for (auto &Hint : CopyHints) {
+ if (TargetHint.first != 0 && Hint.Reg == TargetHint.second)
+ // Don't add the target-type hint again.
+ continue;
+ mri.addRegAllocationHint(li.reg, Hint.Reg);
+ if (!tri.enableMultipleCopyHints())
+ break;
}
+
+ // Weakly boost the spill weight of hinted registers.
+ totalWeight *= 1.01F;
}
// If the live interval was already unspillable, leave it that way.
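To make the ordering that the new CopyHint set imposes concrete, here is a minimal standalone C++ sketch, separate from the patch itself. The register numbers and weights are made up, and plain unsigned/float/bool stand in for LLVM's types; the tie-break on Reg plays the role the temporary HintOrder field plays above.

#include <cstdio>
#include <set>

// Standalone sketch of the CopyHint strict weak ordering used in the patch.
struct Hint {
  unsigned Reg;
  float Weight;
  bool IsPhys;
  bool operator<(const Hint &RHS) const {
    if (IsPhys != RHS.IsPhys)
      return IsPhys && !RHS.IsPhys; // any physreg hint sorts first
    if (Weight != RHS.Weight)
      return Weight > RHS.Weight;   // then by descending copy weight
    return Reg < RHS.Reg;           // tie-break keeps set elements distinct
  }
};

int main() {
  std::set<Hint> Hints = {{5, 2.0f, false}, {3, 1.0f, true}, {7, 4.0f, true}};
  // Iteration visits 7 (phys, 4.0), then 3 (phys, 1.0), then 5 (virt, 2.0),
  // which is the order the hints would be handed to the register allocator.
  for (const Hint &H : Hints)
    std::printf("reg %u\n", H.Reg);
  return 0;
}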
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 774ea9877a7d1..e911085d0adcc 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -345,16 +345,6 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { unsigned Res = getOrCreateVReg(LI); unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); - if (LI.getOrdering() != AtomicOrdering::NotAtomic) { - MIRBuilder.buildAtomicLoad( - Res, Addr, - *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), - Flags, DL->getTypeStoreSize(LI.getType()), - getMemOpAlignment(LI), AAMDNodes(), nullptr, - LI.getSyncScopeID(), LI.getOrdering())); - return true; - } - MIRBuilder.buildLoad( Res, Addr, *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), @@ -376,17 +366,6 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { unsigned Val = getOrCreateVReg(*SI.getValueOperand()); unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); - if (SI.getOrdering() != AtomicOrdering::NotAtomic) { - MIRBuilder.buildAtomicStore( - Val, Addr, - *MF->getMachineMemOperand( - MachinePointerInfo(SI.getPointerOperand()), Flags, - DL->getTypeStoreSize(SI.getValueOperand()->getType()), - getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSyncScopeID(), - SI.getOrdering())); - return true; - } - MIRBuilder.buildStore( Val, Addr, *MF->getMachineMemOperand( diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index fbcb14d5252de..62c396e6cdf23 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -295,8 +295,6 @@ MachineInstrBuilder MachineIRBuilder::buildLoad(unsigned Res, unsigned Addr, MachineMemOperand &MMO) { assert(MRI->getType(Res).isValid() && "invalid operand type"); assert(MRI->getType(Addr).isPointer() && "invalid operand type"); - assert(MMO.getOrdering() == AtomicOrdering::NotAtomic && - "invalid atomic ordering"); return buildInstr(TargetOpcode::G_LOAD) .addDef(Res) @@ -308,8 +306,6 @@ MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr, MachineMemOperand &MMO) { assert(MRI->getType(Val).isValid() && "invalid operand type"); assert(MRI->getType(Addr).isPointer() && "invalid operand type"); - assert(MMO.getOrdering() == AtomicOrdering::NotAtomic && - "invalid atomic ordering"); return buildInstr(TargetOpcode::G_STORE) .addUse(Val) @@ -317,34 +313,6 @@ MachineInstrBuilder MachineIRBuilder::buildStore(unsigned Val, unsigned Addr, .addMemOperand(&MMO); } -MachineInstrBuilder MachineIRBuilder::buildAtomicLoad(unsigned Res, - unsigned Addr, - MachineMemOperand &MMO) { - assert(MRI->getType(Res).isValid() && "invalid operand type"); - assert(MRI->getType(Addr).isPointer() && "invalid operand type"); - assert(MMO.getOrdering() != AtomicOrdering::NotAtomic && - "invalid atomic ordering"); - - return buildInstr(TargetOpcode::G_ATOMIC_LOAD) - .addDef(Res) - .addUse(Addr) - .addMemOperand(&MMO); -} - -MachineInstrBuilder MachineIRBuilder::buildAtomicStore(unsigned Val, - unsigned Addr, - MachineMemOperand &MMO) { - assert(MRI->getType(Val).isValid() && "invalid operand type"); - assert(MRI->getType(Addr).isPointer() && "invalid operand type"); - assert(MMO.getOrdering() != AtomicOrdering::NotAtomic && - "invalid atomic ordering"); - - return buildInstr(TargetOpcode::G_ATOMIC_STORE) - .addUse(Val) - .addUse(Addr) - .addMemOperand(&MMO); -} - MachineInstrBuilder 
MachineIRBuilder::buildUAdde(unsigned Res, unsigned CarryOut, unsigned Op0, unsigned Op1, diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp index 572aed8abf401..2aa9d6b816c81 100644 --- a/lib/CodeGen/MachineFrameInfo.cpp +++ b/lib/CodeGen/MachineFrameInfo.cpp @@ -47,12 +47,13 @@ static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, } int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, - bool isSS, const AllocaInst *Alloca, - uint8_t ID) { + bool IsSpillSlot, + const AllocaInst *Alloca, + uint8_t StackID) { assert(Size != 0 && "Cannot allocate zero size stack objects!"); Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, - !isSS, ID)); + Objects.push_back(StackObject(Size, Alignment, 0, false, IsSpillSlot, Alloca, + !IsSpillSlot, StackID)); int Index = (int)Objects.size() - NumFixedObjects - 1; assert(Index >= 0 && "Bad frame index!"); ensureMaxAlignment(Alignment); @@ -78,7 +79,7 @@ int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, } int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable, bool isAliased) { + bool IsImmutable, bool IsAliased) { assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); // The alignment of the frame index can be determined from its offset from // the incoming frame position. If the frame object is at offset 32 and @@ -86,23 +87,24 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, // object is 16-byte aligned. Note that unlike the non-fixed case, if the // stack needs realignment, we can't assume that the stack will in fact be // aligned. - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ false, - /*Alloca*/ nullptr, isAliased)); + unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + Objects.insert(Objects.begin(), + StackObject(Size, Alignment, SPOffset, IsImmutable, + /*isSpillSlot=*/false, /*Alloca=*/nullptr, + IsAliased)); return -++NumFixedObjects; } int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset, - bool Immutable) { - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ true, - /*Alloca*/ nullptr, - /*isAliased*/ false)); + bool IsImmutable) { + unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 
1 : StackAlignment);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.insert(Objects.begin(),
+ StackObject(Size, Alignment, SPOffset, IsImmutable,
+ /*IsSpillSlot=*/true, /*Alloca=*/nullptr,
+ /*IsAliased=*/false));
return -++NumFixedObjects;
}
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ffce8545a149c..bf619c8113340 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -500,6 +500,19 @@ namespace {
bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT);
+ /// Helper function to calculate whether the given Load can have its
+ /// width reduced to ExtVT.
+ bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+ EVT &ExtVT, unsigned ShAmt = 0);
+
+ /// Used by BackwardsPropagateMask to find suitable loads.
+ bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode *> &Loads,
+ ConstantSDNode *Mask, SDNode *&UncombinedNode);
+
+ /// Attempt to propagate a given AND node back to load leaves so that they
+ /// can be combined into narrow loads.
+ bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
+
/// Helper function for MergeConsecutiveStores which merges the
/// component store chains.
SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
@@ -3726,6 +3739,161 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
return true;
}
+bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+ EVT &ExtVT, unsigned ShAmt) {
+ // Don't transform one with multiple uses, this would require adding a new
+ // load.
+ if (!SDValue(LoadN, 0).hasOneUse())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
+ return false;
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!ExtVT.isRound())
+ return false;
+
+ // Don't change the width of a volatile load.
+ if (LoadN->isVolatile())
+ return false;
+
+ // Verify that we are actually reducing a load width here.
+ if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
+ return false;
+
+ // For the transform to be legal, the load must produce only two values
+ // (the value loaded and the chain). Don't transform a pre-increment
+ // load, for example, which produces an extra value. Otherwise the
+ // transformation is not equivalent, and the downstream logic to replace
+ // uses gets things wrong.
+ if (LoadN->getNumValues() > 2)
+ return false;
+
+ // If the load that we're shrinking is an extload and we're not just
+ // discarding the extension we can't simply shrink the load. Bail.
+ // TODO: It would be possible to merge the extensions in some cases.
+ if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
+ LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
+ return false;
+
+ // It's not possible to generate a constant of extended or untyped type.
+ EVT PtrType = LoadN->getOperand(1).getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ return true;
+}
+
+bool DAGCombiner::SearchForAndLoads(SDNode *N,
+ SmallPtrSetImpl<LoadSDNode *> &Loads,
+ ConstantSDNode *Mask,
+ SDNode *&NodeToMask) {
+ // Recursively search for the operands, looking for loads which can be
+ // narrowed.
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ // Constants should already be fixed up...
+ if (isa<ConstantSDNode>(Op))
+ continue;
+
+ if (!Op.hasOneUse() || Op.getValueType().isVector())
+ return false;
+
+ switch(Op.getOpcode()) {
+ case ISD::LOAD: {
+ auto *Load = cast<LoadSDNode>(Op);
+ EVT ExtVT;
+ if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
+ isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
+ Loads.insert(Load);
+ continue;
+ }
+ return false;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::AssertZext: {
+ unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ EVT VT = Op.getOpcode() == ISD::AssertZext ?
+ cast<VTSDNode>(Op.getOperand(1))->getVT() :
+ Op.getOperand(0).getValueType();
+
+ // We can accept extending nodes if the mask is wider or an equal
+ // width to the original type.
+ if (ExtVT.bitsGE(VT))
+ continue;
+ break;
+ }
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::AND:
+ if (!SearchForAndLoads(Op.getNode(), Loads, Mask, NodeToMask))
+ return false;
+ continue;
+ }
+
+ // Allow one node which will be masked along with any loads found.
+ if (NodeToMask)
+ return false;
+ NodeToMask = Op.getNode();
+ }
+ return true;
+}
+
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
+ auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Mask)
+ return false;
+
+ if (!Mask->getAPIntValue().isMask())
+ return false;
+
+ // No need to do anything if the and directly uses a load.
+ if (isa<LoadSDNode>(N->getOperand(0)))
+ return false;
+
+ SmallPtrSet<LoadSDNode *, 8> Loads;
+ SDNode *FixupNode = nullptr;
+ if (SearchForAndLoads(N, Loads, Mask, FixupNode)) {
+ if (Loads.size() == 0)
+ return false;
+
+ SDValue MaskOp = N->getOperand(1);
+
+ // If it exists, fixup the single node we allow in the tree that needs
+ // masking.
+ if (FixupNode) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
+ FixupNode->getValueType(0),
+ SDValue(FixupNode, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0),
+ MaskOp);
+ }
+
+ for (auto *Load : Loads) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
+ SDValue(Load, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp);
+ SDValue NewLoad = ReduceLoadWidth(And.getNode());
+ assert(NewLoad &&
+ "Shouldn't be masking the load if it can't be narrowed");
+ CombineTo(Load, NewLoad, NewLoad.getValue(1));
+ }
+ DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+ return true;
+ }
+ return false;
+}
+
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -3927,6 +4095,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
+ if (Level >= AfterLegalizeTypes) {
+ // Attempt to propagate the AND back up to the leaves which, if they're
+ // loads, can be combined to narrow loads and the AND node can be removed.
+ // Perform after legalization so that extend nodes will already be
+ // combined into the loads.
+ if (BackwardsPropagateMask(N, DAG)) {
+ return SDValue(N, 0);
+ }
+ }
+
if (SDValue Combined = visitANDLike(N0, N1, N))
return Combined;
@@ -8030,20 +8208,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
ExtType = ISD::ZEXTLOAD;
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
}
- if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
- return SDValue();
-
- unsigned EVTBits = ExtVT.getSizeInBits();
-
- // Do not generate loads of non-round integer types since these can
- // be expensive (and would be wrong if the type is not byte sized).
- if (!ExtVT.isRound())
- return SDValue();
unsigned ShAmt = 0;
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShAmt = N01->getZExtValue();
+ unsigned EVTBits = ExtVT.getSizeInBits();
// Is the shift amount a multiple of size of VT?
if ((ShAmt & (EVTBits-1)) == 0) {
N0 = N0.getOperand(0);
@@ -8080,42 +8250,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
}
}
- // If we haven't found a load, we can't narrow it. Don't transform one with
- // multiple uses, this would require adding a new load.
- if (!isa<LoadSDNode>(N0) || !N0.hasOneUse())
+ // If we haven't found a load, we can't narrow it.
+ if (!isa<LoadSDNode>(N0))
return SDValue();
- // Don't change the width of a volatile load.
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (LN0->isVolatile())
- return SDValue();
-
- // Verify that we are actually reducing a load width here.
- if (LN0->getMemoryVT().getSizeInBits() < EVTBits)
- return SDValue();
-
- // For the transform to be legal, the load must produce only two values
- // (the value loaded and the chain). Don't transform a pre-increment
- // load, for example, which produces an extra value. Otherwise the
- // transformation is not equivalent, and the downstream logic to replace
- // uses gets things wrong.
- if (LN0->getNumValues() > 2)
- return SDValue();
-
- // If the load that we're shrinking is an extload and we're not just
- // discarding the extension we can't simply shrink the load. Bail.
- // TODO: It would be possible to merge the extensions in some cases.
- if (LN0->getExtensionType() != ISD::NON_EXTLOAD &&
- LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
- return SDValue();
-
- if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
- return SDValue();
-
- EVT PtrType = N0.getOperand(1).getValueType();
-
- if (PtrType == MVT::Untyped || PtrType.isExtended())
- // It's not possible to generate a constant of extended or untyped type.
+ if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
// For big endian targets, we need to adjust the offset to the pointer to
@@ -8126,6 +8266,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
}
+ EVT PtrType = N0.getOperand(1).getValueType();
uint64_t PtrOff = ShAmt / 8;
unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
SDLoc DL(LN0);
@@ -8604,6 +8745,13 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+
+ // A BUILD_PAIR always has the least significant part in elt 0 and the
+ // most significant part in elt 1. So when combining into one large load, we
+ // need to consider the endianness.
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LD1, LD2);
+
if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index d86ed0b6f8a52..d232641303de4 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -573,8 +573,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
SDValue LHS = GetPromotedInteger(N->getOperand(1));
SDValue RHS = GetPromotedInteger(N->getOperand(2));
- // Promote all the way up to the canonical SetCC type.
- Mask = PromoteTargetBoolean(Mask, LHS.getValueType());
return DAG.getNode(ISD::VSELECT, SDLoc(N),
LHS.getValueType(), Mask, LHS, RHS);
}
@@ -1209,24 +1207,23 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
// When the data operand has illegal type, we should legalize the data
// operand first. The mask will be promoted/split/widened according to
// the data operand type.
- if (TLI.isTypeLegal(DataVT))
+ if (TLI.isTypeLegal(DataVT)) {
Mask = PromoteTargetBoolean(Mask, DataVT);
- else {
- if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger)
- return PromoteIntOp_MSTORE(N, 3);
-
- else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector)
- return WidenVecOp_MSTORE(N, 3);
-
- else {
- assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector);
- return SplitVecOp_MSTORE(N, 3);
- }
+ // Update in place.
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+ NewOps[2] = Mask;
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+
+ if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger)
+ return PromoteIntOp_MSTORE(N, 3);
+ if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector)
+ return WidenVecOp_MSTORE(N, 3);
+ assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector);
+ return SplitVecOp_MSTORE(N, 3);
} else { // Data operand
assert(OpNo == 3 && "Unexpected operand for promotion");
DataOp = GetPromotedInteger(DataOp);
- Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
TruncateStore = true;
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 88c5dddfec449..2de545654af78 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1147,23 +1147,6 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
}
-/// Widen the given target boolean to a target boolean of the given type.
-/// The boolean vector is widened and then promoted to match the target boolean
-/// type of the given ValVT.
-SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
- bool WithZeroes) {
- SDLoc dl(Bool);
- EVT BoolVT = Bool.getValueType();
-
- assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() &&
- TLI.isTypeLegal(ValVT) &&
- "Unexpected types in WidenTargetBoolean");
- EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(),
- ValVT.getVectorNumElements());
- Bool = ModifyToType(Bool, WideVT, WithZeroes);
- return PromoteTargetBoolean(Bool, ValVT);
-}
-
/// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi.
void DAGTypeLegalizer::SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index c221cb30299a9..64cb80e0d8538 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -183,10 +183,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
- /// Modify Bit Vector to match SetCC result type of ValVT.
- /// The bit vector is widened with zeroes when WithZeroes is true.
- SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false);
-
void ReplaceValueWith(SDValue From, SDValue To);
void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
@@ -623,7 +619,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue ScalarizeVecRes_SETCC(SDNode *N);
SDValue ScalarizeVecRes_UNDEF(SDNode *N);
SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
- SDValue ScalarizeVecRes_VSETCC(SDNode *N);
// Vector Operand Scalarization: <1 x ty> -> ty.
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
@@ -732,7 +727,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_SETCC(SDNode* N);
SDValue WidenVecRes_UNDEF(SDNode *N);
SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
- SDValue WidenVecRes_VSETCC(SDNode* N);
SDValue WidenVecRes_Ternary(SDNode *N);
SDValue WidenVecRes_Binary(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 9ab82382c6590..bded9817b0667 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1885,9 +1885,6 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
- MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType());
- MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType());
-
// if Alignment is equal to the vector size,
// take the half of it for the second part
unsigned SecondHalfAlignment =
@@ -2916,25 +2913,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
ISD::LoadExtType ExtType = N->getExtensionType();
SDLoc dl(N);
- if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
- Mask = GetWidenedVector(Mask);
- else {
- EVT BoolVT = getSetCCResultType(WidenVT);
-
- // We can't use ModifyToType() because we should fill the mask with
- // zeroes
- unsigned WidenNumElts = BoolVT.getVectorNumElements();
- unsigned MaskNumElts = MaskVT.getVectorNumElements();
-
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
- Ops[0] = Mask;
- for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = ZeroVal;
-
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
- }
+ // The mask should be widened as well
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
Mask, Src0, N->getMemoryVT(),
@@ -2950,12 +2933,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Mask = N->getMask();
+ EVT MaskVT = Mask.getValueType();
SDValue Src0 = GetWidenedVector(N->getValue());
unsigned NumElts = WideVT.getVectorNumElements();
SDLoc dl(N);
// The mask should be widened as well
- Mask = WidenTargetBoolean(Mask, WideVT, true);
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WideVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
// Widen the Index operand
SDValue Index = N->getIndex();
@@ -3239,19 +3226,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
N->getOperand(1), InOp1, InOp2, N->getOperand(4));
}
-SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
- assert(N->getValueType(0).isVector() ==
- N->getOperand(0).getValueType().isVector() &&
- "Scalar/Vector type mismatch");
- if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N);
-
- EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- SDValue InOp1 = GetWidenedVector(N->getOperand(0));
- SDValue InOp2 = GetWidenedVector(N->getOperand(1));
- return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT,
- InOp1, InOp2, N->getOperand(2));
-}
-
SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getUNDEF(WidenVT);
@@ -3282,7 +3256,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
}
-SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operands must be vectors");
@@ -3559,6 +3533,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 3 && "Can widen only data operand of mstore");
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
EVT MaskVT = Mask.getValueType();
@@ -3567,25 +3542,13 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
SDValue WideVal = GetWidenedVector(StVal);
SDLoc dl(N);
- if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
- Mask = GetWidenedVector(Mask);
- else {
- // The mask should be widened as well.
- EVT BoolVT = getSetCCResultType(WideVal.getValueType());
- // We can't use ModifyToType() because we should fill the mask with
- // zeroes.
- unsigned WidenNumElts = BoolVT.getVectorNumElements();
- unsigned MaskNumElts = MaskVT.getVectorNumElements();
-
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
- Ops[0] = Mask;
- for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = ZeroVal;
+ // The mask should be widened as well.
+ EVT WideVT = WideVal.getValueType();
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WideVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
- }
assert(Mask.getValueType().getVectorNumElements() ==
WideVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
@@ -3599,15 +3562,18 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
+ EVT MaskVT = Mask.getValueType();
// Widen the value.
SDValue WideVal = GetWidenedVector(DataOp); EVT WideVT = WideVal.getValueType(); - unsigned NumElts = WideVal.getValueType().getVectorNumElements(); + unsigned NumElts = WideVT.getVectorNumElements(); SDLoc dl(N); // The mask should be widened as well. - Mask = WidenTargetBoolean(Mask, WideVT, true); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), + MaskVT.getVectorElementType(), NumElts); + Mask = ModifyToType(Mask, WideMaskVT, true); // Widen index. SDValue Index = MSC->getIndex(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1c31eca3ec916..f3addf05566b6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2148,11 +2148,14 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext())); // Generate code to load the content of the guard slot. - SDValue StackSlot = DAG.getLoad( + SDValue GuardVal = DAG.getLoad( PtrTy, dl, DAG.getEntryNode(), StackSlotPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align, MachineMemOperand::MOVolatile); + if (TLI.useStackGuardXorFP()) + GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl); + // Retrieve guard check function, nullptr if instrumentation is inlined. if (const Value *GuardCheck = TLI.getSSPStackGuardCheck(M)) { // The target provides a guard check function to validate the guard value. @@ -2164,7 +2167,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - Entry.Node = StackSlot; + Entry.Node = GuardVal; Entry.Ty = FnTy->getParamType(0); if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) Entry.IsInReg = true; @@ -2197,7 +2200,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, // Perform the comparison via a subtract/getsetcc. EVT VT = Guard.getValueType(); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, StackSlot); + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, GuardVal); SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), @@ -2207,7 +2210,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, // If the sub is not 0, then we know the guard/stackslot do not equal, so // branch to failure MBB. SDValue BrCond = DAG.getNode(ISD::BRCOND, dl, - MVT::Other, StackSlot.getOperand(0), + MVT::Other, GuardVal.getOperand(0), Cmp, DAG.getBasicBlock(SPD.getFailureMBB())); // Otherwise branch to success MBB. SDValue Br = DAG.getNode(ISD::BR, dl, @@ -5646,6 +5649,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { MachinePointerInfo(Global, 0), Align, MachineMemOperand::MOVolatile); } + if (TLI.useStackGuardXorFP()) + Res = TLI.emitStackGuardXorFP(DAG, Res, sdl); DAG.setRoot(Chain); setValue(&I, Res); return nullptr; diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index e33400288639a..62cef95a4af23 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -385,8 +385,12 @@ static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI, /// - The epilogue checks the value stored in the prologue against the original /// value. It calls __stack_chk_fail if they differ. 
bool StackProtector::InsertStackProtectors() {
+ // If the target wants to XOR the frame pointer into the guard value, it's
+ // impossible to emit the check in IR, so the target *must* support stack
+ // protection in SDAG.
bool SupportsSelectionDAGSP =
- EnableSelectionDAGSP && !TM->Options.EnableFastISel;
+ TLI->useStackGuardXorFP() ||
+ (EnableSelectionDAGSP && !TM->Options.EnableFastISel);
AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index f4e5583cbe7b1..dfda313f23369 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -373,31 +373,36 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
-
- // Hints with HintType != 0 were set by target-dependent code.
- // Such targets must provide their own implementation of
- // TRI::getRegAllocationHints to interpret those hint types.
- assert(Hint.first == 0 && "Target must implement TRI::getRegAllocationHints");
-
- // Target-independent hints are either a physical or a virtual register.
- unsigned Phys = Hint.second;
- if (VRM && isVirtualRegister(Phys))
- Phys = VRM->getPhys(Phys);
-
- // Check that Phys is a valid hint in VirtReg's register class.
- if (!isPhysicalRegister(Phys))
- return false;
- if (MRI.isReserved(Phys))
- return false;
- // Check that Phys is in the allocation order. We shouldn't heed hints
- // from VirtReg's register class if they aren't in the allocation order. The
- // target probably has a reason for removing the register.
- if (!is_contained(Order, Phys))
- return false;
-
- // All clear, tell the register allocator to prefer this register.
- Hints.push_back(Phys);
+ const std::pair<unsigned, SmallVector<unsigned, 4>> &Hints_MRI =
+ MRI.getRegAllocationHints(VirtReg);
+
+ // First hint may be a target hint.
+ bool Skip = (Hints_MRI.first != 0);
+ for (auto Reg : Hints_MRI.second) {
+ if (Skip) {
+ Skip = false;
+ continue;
+ }
+
+ // Target-independent hints are either a physical or a virtual register.
+ unsigned Phys = Reg;
+ if (VRM && isVirtualRegister(Phys))
+ Phys = VRM->getPhys(Phys);
+
+ // Check that Phys is a valid hint in VirtReg's register class.
+ if (!isPhysicalRegister(Phys))
+ continue;
+ if (MRI.isReserved(Phys))
+ continue;
+ // Check that Phys is in the allocation order. We shouldn't heed hints
+ // from VirtReg's register class if they aren't in the allocation order. The
+ // target probably has a reason for removing the register.
+ if (!is_contained(Order, Phys))
+ continue;
+
+ // All clear, tell the register allocator to prefer this register.
+ Hints.push_back(Phys);
+ }
return false;
}
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 28a4375cb28b6..c51340766b798 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1661,6 +1661,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
else
AA = nullptr;
OptLevel = TM.getOptLevel();
+ // Disable optimizations if requested. We cannot skip the whole pass as some
+ // fixups are necessary for correctness.
+ if (skipFunction(*Func.getFunction()))
+ OptLevel = CodeGenOpt::None;
bool MadeChange = false;
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 7d18c98bdc300..c3d79c0d6e6d0 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -32,6 +32,7 @@ add_llvm_library(LLVMDebugInfoCodeView
TypeDumpVisitor.cpp
TypeIndex.cpp
TypeIndexDiscovery.cpp
+ TypeHashing.cpp
TypeRecordMapping.cpp
TypeStreamMerger.cpp
TypeTableCollection.cpp
diff --git a/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp b/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
index 514d55aed0b2e..8aee4aa2e2aee 100644
--- a/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
+++ b/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
@@ -28,27 +28,6 @@
using namespace llvm;
using namespace llvm::codeview;
-static HashedType Empty{0, {}, TypeIndex::None()};
-static HashedType Tombstone{hash_code(-1), {}, TypeIndex::None()};
-
-namespace llvm {
-
-template <> struct DenseMapInfo<HashedType> {
- static inline HashedType getEmptyKey() { return Empty; }
-
- static inline HashedType getTombstoneKey() { return Tombstone; }
-
- static unsigned getHashValue(HashedType Val) { return Val.Hash; }
-
- static bool isEqual(HashedType LHS, HashedType RHS) {
- if (RHS.Hash != LHS.Hash)
- return false;
- return RHS.Data == LHS.Data;
- }
-};
-
-} // end namespace llvm
-
TypeIndex MergingTypeTableBuilder::nextTypeIndex() const {
return TypeIndex::fromArrayIndex(SeenRecords.size());
}
@@ -56,7 +35,6 @@ TypeIndex MergingTypeTableBuilder::nextTypeIndex() const {
MergingTypeTableBuilder::MergingTypeTableBuilder(BumpPtrAllocator &Storage)
: RecordStorage(Storage) {
SeenRecords.reserve(4096);
- SeenHashes.reserve(4096);
}
MergingTypeTableBuilder::~MergingTypeTableBuilder() = default;
@@ -102,13 +80,8 @@ ArrayRef<ArrayRef<uint8_t>> MergingTypeTableBuilder::records() const {
return SeenRecords;
}
-ArrayRef<hash_code> MergingTypeTableBuilder::hashes() const {
- return SeenHashes;
-}
-
void MergingTypeTableBuilder::reset() {
HashedRecords.clear();
- SeenHashes.clear();
SeenRecords.clear();
}
@@ -124,18 +97,19 @@ TypeIndex MergingTypeTableBuilder::insertRecordAs(hash_code Hash,
assert(Record.size() < UINT32_MAX && "Record too big");
assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
- HashedType TempHashedType = {Hash, Record, nextTypeIndex()};
- auto Result = HashedRecords.insert(TempHashedType);
+ LocallyHashedType WeakHash{Hash, Record};
+ auto Result = HashedRecords.try_emplace(WeakHash, nextTypeIndex());
if (Result.second) {
- Result.first->Data = stabilize(RecordStorage, Record);
- SeenRecords.push_back(Result.first->Data);
- SeenHashes.push_back(Result.first->Hash);
+ ArrayRef<uint8_t> RecordData = stabilize(RecordStorage, Record);
+ Result.first->first.RecordData = RecordData;
+ SeenRecords.push_back(RecordData);
}
// Update the caller's copy of Record to point to a stable copy.
- Record = Result.first->Data;
- return Result.first->Index;
+ TypeIndex ActualTI = Result.first->second;
+ Record = SeenRecords[ActualTI.toArrayIndex()];
+ return ActualTI;
}
TypeIndex
diff --git a/lib/DebugInfo/CodeView/TypeHashing.cpp b/lib/DebugInfo/CodeView/TypeHashing.cpp
new file mode 100644
index 0000000000000..57570917e1d89
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeHashing.cpp
@@ -0,0 +1,74 @@
+//===- TypeHashing.cpp ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
+
+#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
+#include "llvm/Support/SHA1.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+LocallyHashedType DenseMapInfo<LocallyHashedType>::Empty{0, {}};
+LocallyHashedType DenseMapInfo<LocallyHashedType>::Tombstone{hash_code(-1), {}};
+
+static std::array<uint8_t, 20> EmptyHash;
+static std::array<uint8_t, 20> TombstoneHash = {
+ {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
+
+GloballyHashedType DenseMapInfo<GloballyHashedType>::Empty{EmptyHash};
+GloballyHashedType DenseMapInfo<GloballyHashedType>::Tombstone{TombstoneHash};
+
+LocallyHashedType LocallyHashedType::hashType(ArrayRef<uint8_t> RecordData) {
+ return {llvm::hash_value(RecordData), RecordData};
+}
+
+GloballyHashedType
+GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
+ ArrayRef<GloballyHashedType> PreviousTypes,
+ ArrayRef<GloballyHashedType> PreviousIds) {
+ SmallVector<TiReference, 4> Refs;
+ discoverTypeIndices(RecordData, Refs);
+ SHA1 S;
+ S.init();
+ uint32_t Off = 0;
+ RecordData = RecordData.drop_front(sizeof(RecordPrefix));
+ for (const auto &Ref : Refs) {
+ // Hash any data that comes before this TiRef.
+ uint32_t PreLen = Ref.Offset - Off;
+ ArrayRef<uint8_t> PreData = RecordData.slice(Off, PreLen);
+ S.update(PreData);
+ auto Prev = (Ref.Kind == TiRefKind::IndexRef) ? PreviousIds : PreviousTypes;
+
+ auto RefData = RecordData.slice(Ref.Offset, Ref.Count * sizeof(TypeIndex));
+ // For each type index referenced, add in the previously computed hash
+ // value of that type.
+ ArrayRef<TypeIndex> Indices(
+ reinterpret_cast<const TypeIndex *>(RefData.data()), Ref.Count);
+ for (TypeIndex TI : Indices) {
+ ArrayRef<uint8_t> BytesToHash;
+ if (TI.isSimple() || TI.isNoneType()) {
+ const uint8_t *IndexBytes = reinterpret_cast<const uint8_t *>(&TI);
+ BytesToHash = makeArrayRef(IndexBytes, sizeof(TypeIndex));
+ } else {
+ BytesToHash = Prev[TI.toArrayIndex()].Hash;
+ }
+ S.update(BytesToHash);
+ }
+
+ Off = Ref.Offset + Ref.Count * sizeof(TypeIndex);
+ }
+
+ // Don't forget to add in any trailing bytes.
+ auto TrailingBytes = RecordData.drop_front(Off);
+ S.update(TrailingBytes);
+
+ return {S.final()};
+}
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index afbe4eb9543d5..76af9a16f0cb7 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -78,6 +78,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
+ Name.startswith("avx512.kunpck") || // Added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.broadcastm") || // Added in 6.0
@@ -1065,6 +1066,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
+ uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
+ uint64_t And = (1ULL << Shift) - 1;
+ Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
+ Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
+ Rep = Builder.CreateOr(LowBits, HighBits);
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index 4bd17257016d7..77ecd7a907713 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -199,39 +199,63 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
"NoWrapKind invalid!");
unsigned BitWidth = Other.getBitWidth();
- if (BinOp != Instruction::Add)
+ ConstantRange Result(BitWidth);
+
+ switch (BinOp) {
+ default:
// Conservative answer: empty set
return ConstantRange(BitWidth, false);
- if (auto *C = Other.getSingleElement())
- if (C->isNullValue())
- // Full set: nothing signed / unsigned wraps when added to 0.
- return ConstantRange(BitWidth);
-
- ConstantRange Result(BitWidth);
+ case Instruction::Add:
+ if (auto *C = Other.getSingleElement())
+ if (C->isNullValue())
+ // Full set: nothing signed / unsigned wraps when added to 0.
+ return ConstantRange(BitWidth); + if (NoWrapKind & OBO::NoUnsignedWrap) + Result = + SubsetIntersect(Result, ConstantRange(APInt::getNullValue(BitWidth), + -Other.getUnsignedMax())); + if (NoWrapKind & OBO::NoSignedWrap) { + const APInt &SignedMin = Other.getSignedMin(); + const APInt &SignedMax = Other.getSignedMax(); + if (SignedMax.isStrictlyPositive()) + Result = SubsetIntersect( + Result, + ConstantRange(APInt::getSignedMinValue(BitWidth), + APInt::getSignedMinValue(BitWidth) - SignedMax)); + if (SignedMin.isNegative()) + Result = SubsetIntersect( + Result, + ConstantRange(APInt::getSignedMinValue(BitWidth) - SignedMin, + APInt::getSignedMinValue(BitWidth))); + } + return Result; - if (NoWrapKind & OBO::NoUnsignedWrap) - Result = - SubsetIntersect(Result, ConstantRange(APInt::getNullValue(BitWidth), - -Other.getUnsignedMax())); - - if (NoWrapKind & OBO::NoSignedWrap) { - const APInt &SignedMin = Other.getSignedMin(); - const APInt &SignedMax = Other.getSignedMax(); - - if (SignedMax.isStrictlyPositive()) - Result = SubsetIntersect( - Result, - ConstantRange(APInt::getSignedMinValue(BitWidth), - APInt::getSignedMinValue(BitWidth) - SignedMax)); - - if (SignedMin.isNegative()) - Result = SubsetIntersect( - Result, ConstantRange(APInt::getSignedMinValue(BitWidth) - SignedMin, - APInt::getSignedMinValue(BitWidth))); + case Instruction::Sub: + if (auto *C = Other.getSingleElement()) + if (C->isNullValue()) + // Full set: nothing signed / unsigned wraps when subtracting 0. + return ConstantRange(BitWidth); + if (NoWrapKind & OBO::NoUnsignedWrap) + Result = + SubsetIntersect(Result, ConstantRange(Other.getUnsignedMax(), + APInt::getMinValue(BitWidth))); + if (NoWrapKind & OBO::NoSignedWrap) { + const APInt &SignedMin = Other.getSignedMin(); + const APInt &SignedMax = Other.getSignedMax(); + if (SignedMax.isStrictlyPositive()) + Result = SubsetIntersect( + Result, + ConstantRange(APInt::getSignedMinValue(BitWidth) + SignedMax, + APInt::getSignedMinValue(BitWidth))); + if (SignedMin.isNegative()) + Result = SubsetIntersect( + Result, + ConstantRange(APInt::getSignedMinValue(BitWidth), + APInt::getSignedMinValue(BitWidth) + SignedMin)); + } + return Result; } - - return Result; } bool ConstantRange::isFullSet() const { diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp index 02382afb8c49a..4aac10a27aa49 100644 --- a/lib/IR/SafepointIRVerifier.cpp +++ b/lib/IR/SafepointIRVerifier.cpp @@ -32,6 +32,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/BasicBlock.h" @@ -168,7 +169,7 @@ static void GatherDominatingDefs(const BasicBlock *BB, const auto &Defs = BlockMap[DTN->getBlock()]->Contribution; Result.insert(Defs.begin(), Defs.end()); // If this block is 'Cleared', then nothing LiveIn to this block can be - // available after this block completes. Note: This turns out to be + // available after this block completes. Note: This turns out to be // really important for reducing memory consumption of the initial available // sets and thus peak memory usage by this verifier. if (BlockMap[DTN->getBlock()]->Cleared) @@ -190,23 +191,21 @@ static void TransferInstruction(const Instruction &I, bool &Cleared, Available.insert(&I); } -/// Compute the AvailableOut set for BB, based on the -/// BasicBlockState BBS, which is the BasicBlockState for BB.
FirstPass is set -/// when the verifier runs for the first time computing the AvailableOut set -/// for BB. -static void TransferBlock(const BasicBlock *BB, - BasicBlockState &BBS, bool FirstPass) { +/// Compute the AvailableOut set for BB, based on the BasicBlockState BBS, +/// which is the BasicBlockState for BB. +/// ContributionChanged is set when the verifier runs for the first time +/// (in this case Contribution was changed from 'empty' to its initial state) or +/// when Contribution of this BB was changed since the last computation. +static void TransferBlock(const BasicBlock *BB, BasicBlockState &BBS, + bool ContributionChanged) { - const DenseSet<const Value *> &AvailableIn = BBS.AvailableIn; + const DenseSet<const Value *> &AvailableIn = BBS.AvailableIn; DenseSet<const Value *> &AvailableOut = BBS.AvailableOut; if (BBS.Cleared) { - // AvailableOut does not change no matter how the input changes, just - // leave it be. We need to force this calculation the first time so that - // we have a AvailableOut at all. - if (FirstPass) { + // AvailableOut will change only when Contribution changed. + if (ContributionChanged) AvailableOut = BBS.Contribution; - } } else { // Otherwise, we need to reduce the AvailableOut set by things which are no // longer in our AvailableIn @@ -293,32 +292,37 @@ static enum BaseType getBaseType(const Value *Val) { : BaseType::ExclusivelySomeConstant; } -static void Verify(const Function &F, const DominatorTree &DT) { - SpecificBumpPtrAllocator<BasicBlockState> BSAllocator; - DenseMap<const BasicBlock *, BasicBlockState *> BlockMap; - - DEBUG(dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n"); - if (PrintOnly) - dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n"; - - - for (const BasicBlock &BB : F) { - BasicBlockState *BBS = new(BSAllocator.Allocate()) BasicBlockState; - for (const auto &I : BB) - TransferInstruction(I, BBS->Cleared, BBS->Contribution); - BlockMap[&BB] = BBS; - } - - for (auto &BBI : BlockMap) { - GatherDominatingDefs(BBI.first, BBI.second->AvailableIn, DT, BlockMap); - TransferBlock(BBI.first, *BBI.second, true); - } +static bool isNotExclusivelyConstantDerived(const Value *V) { + return getBaseType(V) == BaseType::NonConstant; +} +using BlockStateMap = DenseMap<const BasicBlock *, BasicBlockState *>; + +/// This function iterates over all BBs from BlockMap and recalculates +/// AvailableIn/Out for each of them until it converges. +/// It calls Visitor for each visited BB after updating its AvailableIn. +/// BBContributionUpdater may change BB's Contribution and should return true in +/// this case. +/// +/// BBContributionUpdater is expected to have following signature: +/// (const BasicBlock *BB, const BasicBlockState *BBS, +/// DenseSet<const Value *> &Contribution) -> bool +/// FIXME: type of BBContributionUpdater is a template parameter because it +/// might be a lambda with arbitrary non-empty capture list. It's a bit ugly and +/// unclear, but other options cause us to spread the logic of +/// RecalculateBBsStates across the rest of the algorithm. The solution is to +/// move this function, TransferBlock, TransferInstruction and others to a +/// separate class which will hold all the logic related to BlockStateMap. +template <typename VisitorTy> +static void RecalculateBBsStates(BlockStateMap &BlockMap, + VisitorTy &&BBContributionUpdater) { SetVector<const BasicBlock *> Worklist; + // TODO: This order is suboptimal, it's better to replace it with priority + // queue where priority is RPO number of BB. for (auto &BBI : BlockMap) Worklist.insert(BBI.first); - // This loop iterates the AvailableIn and AvailableOut sets to a fixed point.
+ // This loop iterates the AvailableIn/Out sets until it converges. // The AvailableIn and AvailableOut sets decrease as we iterate. while (!Worklist.empty()) { const BasicBlock *BB = Worklist.pop_back_val(); @@ -328,18 +332,49 @@ static void Verify(const Function &F, const DominatorTree &DT) { for (const BasicBlock *PBB : predecessors(BB)) set_intersect(BBS->AvailableIn, BlockMap[PBB]->AvailableOut); - if (OldInCount == BBS->AvailableIn.size()) - continue; + assert(OldInCount >= BBS->AvailableIn.size() && "invariant!"); - assert(OldInCount > BBS->AvailableIn.size() && "invariant!"); + bool InputsChanged = OldInCount != BBS->AvailableIn.size(); + bool ContributionChanged = + BBContributionUpdater(BB, BBS, BBS->Contribution); + if (!InputsChanged && !ContributionChanged) + continue; size_t OldOutCount = BBS->AvailableOut.size(); - TransferBlock(BB, *BBS, false); + TransferBlock(BB, *BBS, ContributionChanged); if (OldOutCount != BBS->AvailableOut.size()) { assert(OldOutCount > BBS->AvailableOut.size() && "invariant!"); Worklist.insert(succ_begin(BB), succ_end(BB)); } } +} + +static void Verify(const Function &F, const DominatorTree &DT) { + SpecificBumpPtrAllocator<BasicBlockState> BSAllocator; + BlockStateMap BlockMap; + + DEBUG(dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n"); + if (PrintOnly) + dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n"; + + + for (const BasicBlock &BB : F) { + BasicBlockState *BBS = new(BSAllocator.Allocate()) BasicBlockState; + for (const auto &I : BB) + TransferInstruction(I, BBS->Cleared, BBS->Contribution); + BlockMap[&BB] = BBS; + } + + for (auto &BBI : BlockMap) { + GatherDominatingDefs(BBI.first, BBI.second->AvailableIn, DT, BlockMap); + TransferBlock(BBI.first, *BBI.second, true); + } + + RecalculateBBsStates(BlockMap, [] (const BasicBlock *, + const BasicBlockState *, + DenseSet<const Value *> &) { + return false; + }); // We now have all the information we need to decide if the use of a heap // reference is legal or not, given our safepoint semantics. @@ -356,16 +391,58 @@ static void Verify(const Function &F, const DominatorTree &DT) { AnyInvalidUses = true; }; - auto isNotExclusivelyConstantDerived = [](const Value *V) { - return getBaseType(V) == BaseType::NonConstant; - }; + // This set contains defs that can be safely ignored during verification. + DenseSet<const Instruction *> ValidUnrelocatedDefs; + + // Now we can remove all valid unrelocated gc pointer defs from all BBS sets. + RecalculateBBsStates(BlockMap, [&ValidUnrelocatedDefs]( + const BasicBlock *BB, + const BasicBlockState *BBS, + DenseSet<const Value *> &Contribution) { + DenseSet<const Value *> AvailableSet = BBS->AvailableIn; + bool ContributionChanged = false; + for (const Instruction &I : *BB) { + bool ProducesUnrelocatedPointer = false; + if ((isa<GetElementPtrInst>(I) || isa<BitCastInst>(I)) && + containsGCPtrType(I.getType())) { + // GEP/bitcast of unrelocated pointer is legal by itself but this + // def shouldn't appear in any AvailableSet. + for (const Value *V : I.operands()) + if (containsGCPtrType(V->getType()) && + isNotExclusivelyConstantDerived(V) && !AvailableSet.count(V)) { + ProducesUnrelocatedPointer = true; + break; + } + } + if (!ProducesUnrelocatedPointer) { + bool Cleared = false; + TransferInstruction(I, Cleared, AvailableSet); + (void)Cleared; + } else { + // Remove def of unrelocated pointer from Contribution of this BB + // and trigger update of all its successors.
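(Illustration, not from the patch: given a gc pointer %base that crossed a safepoint without being relocated, a def such as %d = getelementptr i8, i8 addrspace(1)* %base, i64 8 is legal by itself, but it is erased from the Contribution below so it never enters an AvailableSet; otherwise later uses of %d would wrongly pass verification as uses of a relocated value.)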
+ Contribution.erase(&I); + ValidUnrelocatedDefs.insert(&I); + DEBUG(dbgs() << "Removing " << I << " from Contribution of " + << BB->getName() << "\n"); + ContributionChanged = true; + } + } + return ContributionChanged; + }); - for (const BasicBlock &BB : F) { + // We need RPO here to a) always report the first error and b) report + // errors in the same order from run to run. + ReversePostOrderTraversal<const Function *> RPOT(&F); + for (const BasicBlock *BB : RPOT) { + BasicBlockState *BBS = BlockMap[BB]; // We destructively modify AvailableIn as we traverse the block instruction // by instruction. - DenseSet<const Value *> &AvailableSet = BlockMap[&BB]->AvailableIn; - for (const Instruction &I : BB) { - if (const PHINode *PN = dyn_cast<PHINode>(&I)) { + DenseSet<const Value *> &AvailableSet = BBS->AvailableIn; + for (const Instruction &I : *BB) { + if (ValidUnrelocatedDefs.count(&I)) { + continue; // This instruction shouldn't be added to AvailableSet. + } else if (const PHINode *PN = dyn_cast<PHINode>(&I)) { if (containsGCPtrType(PN->getType())) for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { const BasicBlock *InBB = PN->getIncomingBlock(i); diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 748036e8a2577..32e7cd4f90979 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -257,6 +257,66 @@ static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) { OS.EmitValue(ABS, Size); } +static void +emitV2FileDirTables(MCStreamer *MCOS, + const SmallVectorImpl<std::string> &MCDwarfDirs, + const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles) { + // First the directory table. + for (auto Dir : MCDwarfDirs) { + MCOS->EmitBytes(Dir); // The DirectoryName, and... + MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator. + } + MCOS->EmitIntValue(0, 1); // Terminate the directory list. + + // Second the file table. + for (unsigned i = 1; i < MCDwarfFiles.size(); i++) { + assert(!MCDwarfFiles[i].Name.empty()); + MCOS->EmitBytes(MCDwarfFiles[i].Name); // FileName and... + MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator. + MCOS->EmitULEB128IntValue(MCDwarfFiles[i].DirIndex); // Directory number. + MCOS->EmitIntValue(0, 1); // Last modification timestamp (always 0). + MCOS->EmitIntValue(0, 1); // File size (always 0). + } + MCOS->EmitIntValue(0, 1); // Terminate the file list. +} + +static void +emitV5FileDirTables(MCStreamer *MCOS, + const SmallVectorImpl<std::string> &MCDwarfDirs, + const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles, + StringRef CompilationDir) { + // The directory format, which is just inline null-terminated strings. + MCOS->EmitIntValue(1, 1); + MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path); + MCOS->EmitULEB128IntValue(dwarf::DW_FORM_string); + // Then the list of directory paths. CompilationDir comes first. + MCOS->EmitULEB128IntValue(MCDwarfDirs.size() + 1); + MCOS->EmitBytes(CompilationDir); + MCOS->EmitBytes(StringRef("\0", 1)); + for (auto Dir : MCDwarfDirs) { + MCOS->EmitBytes(Dir); // The DirectoryName, and... + MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator. + } + + // The file format, which is the inline null-terminated filename and a + // directory index. We don't track file size/timestamp so don't emit them + // in the v5 table. + // FIXME: Arrange to emit MD5 signatures for the source files. + MCOS->EmitIntValue(2, 1); + MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path); + MCOS->EmitULEB128IntValue(dwarf::DW_FORM_string); + MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_directory_index); + MCOS->EmitULEB128IntValue(dwarf::DW_FORM_udata); + // Then the list of file names. These start at 1 for some reason.
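(Sketch of the resulting v5 layout, assuming two files in one extra directory: file_names_count = MCDwarfFiles.size() - 1 because slot 0 is unused, then per file its null-terminated name followed by a ULEB128 directory index, e.g. "a.c\0" 1 "b.c\0" 1, where directory index 0 is CompilationDir and 1 is the extra directory.)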
+ MCOS->EmitULEB128IntValue(MCDwarfFiles.size() - 1); + for (unsigned i = 1; i < MCDwarfFiles.size(); ++i) { + assert(!MCDwarfFiles[i].Name.empty()); + MCOS->EmitBytes(MCDwarfFiles[i].Name); // FileName and... + MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator. + MCOS->EmitULEB128IntValue(MCDwarfFiles[i].DirIndex); // Directory number. + } +} + std::pair<MCSymbol *, MCSymbol *> MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, ArrayRef<char> StandardOpcodeLengths) const { @@ -279,29 +339,35 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, // Next 2 bytes is the Version. unsigned LineTableVersion = context.getDwarfVersion(); - // FIXME: Right now the compiler doesn't support line table V5. Until it's - // supported keep generating line table V4, when Dwarf Info version V5 is used. - if (LineTableVersion >= 5) - LineTableVersion = 4; MCOS->EmitIntValue(LineTableVersion, 2); + // Keep track of the bytes between the very start and where the header length + // comes out. + unsigned PreHeaderLengthBytes = 4 + 2; + + // In v5, we get address info next. + if (LineTableVersion >= 5) { + MCOS->EmitIntValue(context.getAsmInfo()->getCodePointerSize(), 1); + MCOS->EmitIntValue(0, 1); // Segment selector; same as EmitGenDwarfAranges. + PreHeaderLengthBytes += 2; + } + // Create a symbol for the end of the prologue (to be set when we get there). MCSymbol *ProEndSym = context.createTempSymbol(); // Lprologue_end - // Length of the prologue, is the next 4 bytes. Which is the start of the - // section to the end of the prologue. Not including the 4 bytes for the - // total length, the 2 bytes for the version, and these 4 bytes for the - // length of the prologue. - emitAbsValue( - *MCOS, - MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym, (4 + 2 + 4)), 4); + // Length of the prologue is the next 4 bytes. This is actually the length + // from after the length word to the end of the prologue. + emitAbsValue(*MCOS, + MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym, + (PreHeaderLengthBytes + 4)), + 4); // Parameters of the state machine, are next. MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1); // maximum_operations_per_instruction // For non-VLIW architectures this field is always 1. // FIXME: VLIW architectures need to update this field accordingly. - if (context.getDwarfVersion() >= 4) + if (LineTableVersion >= 4) MCOS->EmitIntValue(1, 1); MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1); MCOS->EmitIntValue(Params.DWARF2LineBase, 1); @@ -312,26 +378,12 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, for (char Length : StandardOpcodeLengths) MCOS->EmitIntValue(Length, 1); - // Put out the directory and file tables. - - // First the directory table. - for (unsigned i = 0; i < MCDwarfDirs.size(); i++) { - MCOS->EmitBytes(MCDwarfDirs[i]); // the DirectoryName - MCOS->EmitBytes(StringRef("\0", 1)); // the null term. of the string - } - MCOS->EmitIntValue(0, 1); // Terminate the directory list - - // Second the file table. - for (unsigned i = 1; i < MCDwarfFiles.size(); i++) { - assert(!MCDwarfFiles[i].Name.empty()); - MCOS->EmitBytes(MCDwarfFiles[i].Name); // FileName - MCOS->EmitBytes(StringRef("\0", 1)); // the null term.
of the string - // the Directory num - MCOS->EmitULEB128IntValue(MCDwarfFiles[i].DirIndex); - MCOS->EmitIntValue(0, 1); // last modification timestamp (always 0) - MCOS->EmitIntValue(0, 1); // filesize (always 0) - } - MCOS->EmitIntValue(0, 1); // Terminate the file list + // Put out the directory and file tables. The formats vary depending on + // the version. + if (LineTableVersion >= 5) + emitV5FileDirTables(MCOS, MCDwarfDirs, MCDwarfFiles, CompilationDir); + else + emitV2FileDirTables(MCOS, MCDwarfDirs, MCDwarfFiles); // This is the end of the prologue, so set the value of the symbol at the // end of the prologue (that was used in a previous expression). diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 42521ac72e2c3..0e93eb835c023 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -115,6 +115,7 @@ struct WasmImport { StringRef FieldName; unsigned Kind; int32_t Type; + bool IsMutable; }; // A wasm function to be written into the function section. @@ -287,7 +288,7 @@ class WasmObjectWriter : public MCObjectWriter { void writeLinkingMetaDataSection( ArrayRef<WasmDataSegment> Segments, uint32_t DataSize, SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags, - bool HasStackPointer, uint32_t StackPointerGlobal); + Optional<uint32_t> StackPointerGlobal); uint32_t getProvisionalValue(const WasmRelocationEntry &RelEntry); void applyRelocations(ArrayRef<WasmRelocationEntry> Relocations, @@ -681,7 +682,7 @@ void WasmObjectWriter::writeImportSection(ArrayRef<WasmImport> Imports) { break; case wasm::WASM_EXTERNAL_GLOBAL: encodeSLEB128(int32_t(Import.Type), getStream()); - encodeULEB128(0, getStream()); // mutability + encodeULEB128(int32_t(Import.IsMutable), getStream()); break; default: llvm_unreachable("unsupported import kind"); @@ -929,14 +930,14 @@ void WasmObjectWriter::writeDataRelocSection() { void WasmObjectWriter::writeLinkingMetaDataSection( ArrayRef<WasmDataSegment> Segments, uint32_t DataSize, SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags, - bool HasStackPointer, uint32_t StackPointerGlobal) { + Optional<uint32_t> StackPointerGlobal) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); SectionBookkeeping SubSection; - if (HasStackPointer) { + if (StackPointerGlobal.hasValue()) { startSection(SubSection, wasm::WASM_STACK_POINTER); - encodeULEB128(StackPointerGlobal, getStream()); // id + encodeULEB128(StackPointerGlobal.getValue(), getStream()); // id endSection(SubSection); } @@ -1010,9 +1011,9 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, SmallPtrSet<const MCSymbol *, 4> IsAddressTaken; unsigned NumFuncImports = 0; SmallVector<WasmDataSegment, 4> DataSegments; - uint32_t StackPointerGlobal = 0; + Optional<StringRef> StackPointerGlobalName; + Optional<uint32_t> StackPointerGlobal; uint32_t DataSize = 0; - bool HasStackPointer = false; // Populate the IsAddressTaken set. for (const WasmRelocationEntry &RelEntry : CodeRelocations) { @@ -1036,41 +1037,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, } } - // Populate FunctionTypeIndices and Imports. - for (const MCSymbol &S : Asm.symbols()) { - const auto &WS = static_cast<const MCSymbolWasm &>(S); - - // Register types for all functions, including those with private linkage - // (making them - // because wasm always needs a type signature. - if (WS.isFunction()) - registerFunctionType(WS); - - if (WS.isTemporary()) - continue; - - // If the symbol is not defined in this translation unit, import it.
- if (!WS.isDefined(/*SetUsed=*/false)) { - WasmImport Import; - Import.ModuleName = WS.getModuleName(); - Import.FieldName = WS.getName(); - - if (WS.isFunction()) { - Import.Kind = wasm::WASM_EXTERNAL_FUNCTION; - Import.Type = getFunctionType(WS); - SymbolIndices[&WS] = NumFuncImports; - ++NumFuncImports; - } else { - Import.Kind = wasm::WASM_EXTERNAL_GLOBAL; - Import.Type = int32_t(PtrType); - SymbolIndices[&WS] = NumGlobalImports; - ++NumGlobalImports; - } - - Imports.push_back(Import); - } - } - // In the special .global_variables section, we've encoded global // variables used by the function. Translate them into the Globals // list. @@ -1143,10 +1109,52 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, if (!DataFrag.getFixups().empty()) report_fatal_error("fixups not supported in .stack_pointer"); const SmallVectorImpl<char> &Contents = DataFrag.getContents(); - if (Contents.size() != 4) - report_fatal_error("only one entry supported in .stack_pointer"); - HasStackPointer = true; - StackPointerGlobal = NumGlobalImports + *(const int32_t *)Contents.data(); + StackPointerGlobalName = StringRef(Contents.data(), Contents.size()); + } + + // Populate FunctionTypeIndices and Imports. + for (const MCSymbol &S : Asm.symbols()) { + const auto &WS = static_cast<const MCSymbolWasm &>(S); + + // Register types for all functions, including those with private linkage, + // because wasm always needs a type signature. + if (WS.isFunction()) + registerFunctionType(WS); + + if (WS.isTemporary()) + continue; + + // If the symbol is not defined in this translation unit, import it. + if (!WS.isDefined(/*SetUsed=*/false)) { + WasmImport Import; + Import.ModuleName = WS.getModuleName(); + Import.FieldName = WS.getName(); + + if (WS.isFunction()) { + Import.Kind = wasm::WASM_EXTERNAL_FUNCTION; + Import.Type = getFunctionType(WS); + SymbolIndices[&WS] = NumFuncImports; + ++NumFuncImports; + } else { + Import.Kind = wasm::WASM_EXTERNAL_GLOBAL; + Import.Type = int32_t(PtrType); + Import.IsMutable = false; + SymbolIndices[&WS] = NumGlobalImports; + + // If this global is the stack pointer, make it mutable and remember it + // so that we can emit metadata for it. + if (StackPointerGlobalName.hasValue() && + WS.getName() == StackPointerGlobalName.getValue()) { + Import.IsMutable = true; + StackPointerGlobal = NumGlobalImports; + } + + ++NumGlobalImports; + } + + Imports.push_back(Import); + } } for (MCSection &Sec : Asm) { @@ -1331,7 +1339,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, writeCodeRelocSection(); writeDataRelocSection(); writeLinkingMetaDataSection(DataSegments, DataSize, SymbolFlags, - HasStackPointer, StackPointerGlobal); + StackPointerGlobal); // TODO: Translate the .comment section to the output. // TODO: Translate debug sections to the output. diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 5d95a9a9a56dd..5723f8fcf5bb3 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -1,4 +1,7 @@ set(system_libs) +if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ ) + set(system_libs ${system_libs} ${ZLIB_LIBRARIES}) +endif() if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc.
set(system_libs ${system_libs} psapi shell32 ole32 uuid) @@ -21,9 +24,6 @@ elseif( CMAKE_HOST_UNIX ) set(system_libs ${system_libs} atomic) endif() set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB}) - if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ ) - set(system_libs ${system_libs} z) - endif() if( UNIX AND NOT (BEOS OR HAIKU) ) set(system_libs ${system_libs} m) endif() diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index d4b9d02e030da..f229f23a4f84a 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -1099,8 +1099,14 @@ Error TempFile::keep(const Twine &Name) { std::error_code RenameEC = cancelDeleteOnClose(FD); if (!RenameEC) RenameEC = rename_fd(FD, Name); + // If we can't rename, discard the temporary file. + if (RenameEC) + removeFD(FD); #else std::error_code RenameEC = fs::rename(TmpName, Name); + // If we can't rename, discard the temporary file. + if (RenameEC) + remove(TmpName); sys::DontRemoveFileOnSignal(TmpName); #endif diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp index 5009607a4780c..abc46d0765762 100644 --- a/lib/Support/TarWriter.cpp +++ b/lib/Support/TarWriter.cpp @@ -173,6 +173,10 @@ void TarWriter::append(StringRef Path, StringRef Data) { // Write Path and Data. std::string Fullpath = BaseDir + "/" + sys::path::convert_to_slash(Path); + // We do not want to include the same file more than once. + if (!Files.insert(Fullpath).second) + return; + StringRef Prefix; StringRef Name; if (splitUstar(Fullpath, Prefix, Name)) { diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index f5b1c0ffe69d4..f81790b17df57 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -391,6 +391,20 @@ std::error_code is_local(int FD, bool &Result) { return is_local_internal(FinalPath, Result); } +static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) { + FILE_DISPOSITION_INFO Disposition; + Disposition.DeleteFile = Delete; + if (!SetFileInformationByHandle(Handle, FileDispositionInfo, &Disposition, + sizeof(Disposition))) + return mapWindowsError(::GetLastError()); + return std::error_code(); +} + +static std::error_code removeFD(int FD) { + HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD)); + return setDeleteDisposition(Handle, true); +} + /// In order to handle temporary files we want the following properties /// /// * The temporary file is deleted on crashes @@ -425,11 +439,9 @@ static std::error_code cancelDeleteOnClose(int &FD) { if (close(FD)) return mapWindowsError(::GetLastError()); - FILE_DISPOSITION_INFO Disposition; - Disposition.DeleteFile = false; - if (!SetFileInformationByHandle(NewHandle, FileDispositionInfo, &Disposition, - sizeof(Disposition))) - return mapWindowsError(::GetLastError()); + if (std::error_code EC = setDeleteDisposition(NewHandle, false)) + return EC; + FD = ::_open_osfhandle(intptr_t(NewHandle), 0); if (FD == -1) { ::CloseHandle(NewHandle); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 00c4504464858..faddd39f79db5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4006,9 +4006,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction.
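(Typical input for this combine, shown for illustration only: %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b); %ov = extractvalue { i32, i1 } %s, 1; br i1 %ov, label %t, label %f. The rewrite below drops the assert and instead bails out unless CC is SETEQ or SETNE, since other comparisons of the i1 overflow bit can legitimately reach this point.)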
- if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); + if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 6999721b6260c..c2d3ae31c6243 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -889,6 +889,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } + auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + const unsigned PtrReg = I.getOperand(1).getReg(); #ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 6531d5ebe4c0e..05df512022298 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -231,14 +231,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { setAction({MemOp, 1, p0}, Legal); } - for (unsigned MemOp : {G_ATOMIC_LOAD, G_ATOMIC_STORE}) { - for (auto Ty : {s8, s16, s32, s64, p0}) - setAction({MemOp, Ty}, Legal); - - // And everything's fine in addrspace 0. - setAction({MemOp, 1, p0}, Legal); - } - // Constants for (auto Ty : {s32, s64}) { setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 22d724628376f..ca04097e1cb62 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -135,6 +135,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), FP64(false), + FMA(false), IsGCN(false), GCN3Encoding(false), CIInsts(false), diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 38803204d6e7e..b325a49e11fa7 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -330,8 +330,9 @@ void GCNScheduleDAGMILive::schedule() { std::vector<MachineInstr *> Unsched; Unsched.reserve(NumRegionInstrs); - for (auto &I : *this) + for (auto &I : *this) { Unsched.push_back(&I); + } GCNRegPressure PressureBefore; if (LIS) { @@ -387,6 +388,9 @@ void GCNScheduleDAGMILive::schedule() { DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { + if (MI->isDebugValue()) + continue; + if (MI->getIterator() != RegionEnd) { BB->remove(MI); BB->insert(RegionEnd, MI); diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 6b2668fe052f7..647aafe4d2ea9 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -134,8 +134,11 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } while (I != E) { - if (I->isDebugValue()) + if (I->isDebugValue()) { + I = std::next(I); continue; + } + if (I->mayStore() || I->isBarrier() || I->isCall() || I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef()) break; diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 59bc44bdfa052..5fda45f6a7fca 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -324,7 +324,8 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg, return &DefMO; } - llvm_unreachable("invalid reg"); + // Ignore implicit defs. + return nullptr; } uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index b43faf4903a85..6bbeae2e11514 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -801,6 +801,12 @@ bool ARMInstructionSelector::select(MachineInstr &I, return selectGlobal(MIB, MRI); case G_STORE: case G_LOAD: { + const auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + unsigned Reg = I.getOperand(0).getReg(); unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID(); diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 25f101b493131..0931eb8d8bfbc 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1928,7 +1928,8 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, for (auto *B : L->blocks()) for (auto &I : *B) - if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access)) + if (Ignored.count(&I) == 0 && + intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)) return true; return false; diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td index 41300a1b62954..06905fb41e444 100644 --- a/lib/Target/SystemZ/SystemZ.td +++ b/lib/Target/SystemZ/SystemZ.td @@ -58,7 +58,7 @@ include "SystemZInstrHFP.td" include "SystemZInstrDFP.td" include "SystemZInstrSystem.td" -def SystemZInstrInfo : InstrInfo {} +def SystemZInstrInfo : InstrInfo { let guessInstructionProperties = 0; } //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ad14e5e34e2ec..c239cd5ad4688 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2201,6 +2201,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, NewC.Op0.getOpcode() == ISD::SHL && isSimpleShift(NewC.Op0, ShiftVal) && (MaskVal >> ShiftVal != 0) && + ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal >> ShiftVal, CmpVal >> ShiftVal, @@ -2211,6 +2212,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, NewC.Op0.getOpcode() == ISD::SRL && isSimpleShift(NewC.Op0, ShiftVal) && (MaskVal << ShiftVal != 0) && + ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal << ShiftVal, CmpVal << ShiftVal, diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 02aeaadad0d9a..16edbea87cdac 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +// TODO: Most floating-point instructions (except for simple moves and the +// like) can raise exceptions -- should they have 
hasSideEffects=1 ? + //===----------------------------------------------------------------------===// // Select instructions //===----------------------------------------------------------------------===// @@ -29,22 +32,20 @@ defm CondStoreF64 : CondStores; def LZDR : InherentRRE<"lzdr", 0xB375, FP64, fpimm0>; def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>; } // Moves between two floating-point registers. -let hasSideEffects = 0 in { - def LER : UnaryRR <"ler", 0x38, null_frag, FP32, FP32>; - def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>; - def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>; +def LER : UnaryRR <"ler", 0x38, null_frag, FP32, FP32>; +def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>; +def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>; - // For z13 we prefer LDR over LER to avoid partial register dependencies. - let isCodeGenOnly = 1 in - def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>; -} +// For z13 we prefer LDR over LER to avoid partial register dependencies. +let isCodeGenOnly = 1 in + def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>; // Moves between two floating-point registers that also set the condition // codes. @@ -130,7 +131,7 @@ defm LoadStoreF128 : MVCLoadStore; // Load instructions //===----------------------------------------------------------------------===// -let canFoldAsLoad = 1, SimpleBDXLoad = 1 in { +let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in { defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>; defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>; @@ -150,7 +151,7 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in { // Store instructions //===----------------------------------------------------------------------===// -let SimpleBDXStore = 1 in { +let SimpleBDXStore = 1, mayStore = 1 in { defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>; defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>; @@ -525,11 +526,14 @@ let Defs = [CC], CCValues = 0xC in { //===----------------------------------------------------------------------===// let hasSideEffects = 1 in { - def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>; - def STFPC : StoreInherentS<"stfpc", 0xB29C, storei, 4>; + let mayLoad = 1, mayStore = 1 in { + // TODO: EFPC and SFPC do not touch memory at all + def EFPC : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>; + def STFPC : StoreInherentS<"stfpc", 0xB29C, storei, 4>; - def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>; - def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu, 4>; + def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>; + def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu, 4>; + } def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>; def LFAS : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>; diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 033a0a879d37d..06da66ad87645 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -21,6 +21,10 @@ class InstSystemZ opcode> : InstRXYb { let CCMaskFirst = 1; + let mayLoad = 1; } class AsmCondBranchRXY opcode> : InstRXYb; + mnemonic#"\t$M1, $XBD2", []> { + let mayLoad = 1; +} class FixedCondBranchRXY opcode, SDPatternOperator operator = null_frag> @@ -2113,6 +2120,7 @@ class FixedCondBranchRXY opcode, [(operator (load bdxaddr20only:$XBD2))]> { let isAsmParserOnly = V.alternate; let M1 = V.ccmask; + let mayLoad = 1; } class CmpBranchRIEa opcode, @@ -2784,7 
+2792,6 @@ multiclass CondUnaryRSYPair<string mnemonic, bits<16> opcode, def Asm : AsmCondUnaryRSY<mnemonic, opcode, cls, bytes, mode>; } - class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls, bits<5> bytes, AddressingMode mode = bdxaddr12only> @@ -4688,7 +4695,8 @@ class SelectWrapper // Stores $new to $addr if $cc is true ("" case) or false (Inv case). multiclass CondStores<RegisterOperand cls, SDPatternOperator store, SDPatternOperator load, AddressingMode mode> { - let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in { + let Defs = [CC], Uses = [CC], usesCustomInserter = 1, + mayLoad = 1, mayStore = 1 in { def "" : Pseudo<(outs), (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask cls:$new, (load mode:$addr), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 55a796cddf437..87462c1d681de 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -11,24 +11,25 @@ // Stack allocation //===----------------------------------------------------------------------===// -let hasNoSchedulingInfo = 1 in { +// The callseq_start node requires the hasSideEffects flag, even though these +// instructions are noops on SystemZ. +let hasNoSchedulingInfo = 1, hasSideEffects = 1 in { def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } -let hasSideEffects = 0 in { - // Takes as input the value of the stack pointer after a dynamic allocation - // has been made. Sets the output to the address of the dynamically- - // allocated area itself, skipping the outgoing arguments. - // - // This expands to an LA or LAY instruction. We restrict the offset - // to the range of LA and keep the LAY range in reserve for when - // the size of the outgoing arguments is added. - def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src), - [(set GR64:$dst, dynalloc12only:$src)]>; -} +// Takes as input the value of the stack pointer after a dynamic allocation +// has been made. Sets the output to the address of the dynamically- +// allocated area itself, skipping the outgoing arguments. +// +// This expands to an LA or LAY instruction. We restrict the offset +// to the range of LA and keep the LAY range in reserve for when +// the size of the outgoing arguments is added. +def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src), + [(set GR64:$dst, dynalloc12only:$src)]>; + //===----------------------------------------------------------------------===// // Branch instructions @@ -197,15 +198,15 @@ let isBranch = 1, isTerminator = 1 in { //===----------------------------------------------------------------------===// // Unconditional trap. -let hasCtrlDep = 1 in +let hasCtrlDep = 1, hasSideEffects = 1 in def Trap : Alias<4, (outs), (ins), [(trap)]>; // Conditional trap. -let hasCtrlDep = 1, Uses = [CC] in +let hasCtrlDep = 1, Uses = [CC], hasSideEffects = 1 in def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>; // Fused compare-and-trap instructions. -let hasCtrlDep = 1 in { +let hasCtrlDep = 1, hasSideEffects = 1 in { // These patterns work the same way as for compare-and-branch.
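(Since SystemZ.td now sets guessInstructionProperties = 0, no flags are inferred from the patterns, so trapping behaviour has to be spelled out; the hasSideEffects = 1 added above keeps the optimizers from hoisting, sinking or deleting a trap, e.g. a CRT that must fire only when its comparison holds.)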
defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>; defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>; @@ -360,13 +361,12 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store, -let hasSideEffects = 0 in { - // Expands to LR, RISBHG or RISBLG, depending on the choice of registers. - def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>, - Requires<[FeatureHighWord]>; - def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>; - def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>; -} +// Expands to LR, RISBHG or RISBLG, depending on the choice of registers. +def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>, + Requires<[FeatureHighWord]>; +def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>; +def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>; + let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { def LTR : UnaryRR <"ltr", 0x12, null_frag, GR32, GR32>; def LTGR : UnaryRRE<"ltgr", 0xB902, null_frag, GR64, GR64>; @@ -376,8 +376,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in def PAIR128 : Pseudo<(outs GR128:$dst), (ins GR64:$hi, GR64:$lo), []>; // Immediate moves. -let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, - isReMaterializable = 1 in { +let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in { // 16-bit sign-extended immediates. LHIMux expands to LHI or IIHF, // depending on the choice of register. def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>, @@ -398,7 +397,7 @@ let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, } // Register loads. -let canFoldAsLoad = 1, SimpleBDXLoad = 1 in { +let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in { // Expands to L, LY or LFH, depending on the choice of register. def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>, Requires<[FeatureHighWord]>; @@ -435,14 +434,14 @@ let Predicates = [FeatureLoadAndZeroRightmostByte] in { } // Load and trap. -let Predicates = [FeatureLoadAndTrap] in { +let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in { def LAT : UnaryRXY<"lat", 0xE39F, null_frag, GR32, 4>; def LFHAT : UnaryRXY<"lfhat", 0xE3C8, null_frag, GRH32, 4>; def LGAT : UnaryRXY<"lgat", 0xE385, null_frag, GR64, 8>; } // Register stores. -let SimpleBDXStore = 1 in { +let SimpleBDXStore = 1, mayStore = 1 in { // Expands to ST, STY or STFH, depending on the choice of register. def STMux : StoreRXYPseudo<store, GRX32, 4>, Requires<[FeatureHighWord]>; @@ -489,17 +488,16 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in { // Load immediate on condition. Matched via DAG pattern and created // by the PeepholeOptimizer via FoldImmediate. - let hasSideEffects = 0 in { - // Expands to LOCHI or LOCHHI, depending on the choice of register. - def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>; - defm LOCHHI : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>; - defm LOCHI : CondBinaryRIEPair<"lochi", 0xEC42, GR32, imm32sx16>; - defm LOCGHI : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>; - } + + // Expands to LOCHI or LOCHHI, depending on the choice of register. + def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>; + defm LOCHHI : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>; + defm LOCHI : CondBinaryRIEPair<"lochi", 0xEC42, GR32, imm32sx16>; + defm LOCGHI : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>; // Move register on condition. Expanded from Select* pseudos and // created by early if-conversion. - let hasSideEffects = 0, isCommutable = 1 in { + let isCommutable = 1 in { // Expands to LOCR or LOCFHR or a branch-and-move sequence, // depending on the choice of registers.
def LOCRMux : CondBinaryRRFPseudo<GRX32, GRX32>; @@ -534,7 +532,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in { let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in { // Move register on condition. Expanded from Select* pseudos and // created by early if-conversion. - let hasSideEffects = 0, isCommutable = 1 in { + let isCommutable = 1 in { defm LOCR : CondBinaryRRFPair<"locr", 0xB9F2, GR32, GR32>; defm LOCGR : CondBinaryRRFPair<"locgr", 0xB9E2, GR64, GR64>; } @@ -570,17 +568,14 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in { //===----------------------------------------------------------------------===// // 32-bit extensions from registers. -let hasSideEffects = 0 in { - def LBR : UnaryRRE<"lbr", 0xB926, sext8, GR32, GR32>; - def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>; -} +def LBR : UnaryRRE<"lbr", 0xB926, sext8, GR32, GR32>; +def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>; // 64-bit extensions from registers. -let hasSideEffects = 0 in { - def LGBR : UnaryRRE<"lgbr", 0xB906, sext8, GR64, GR64>; - def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>; - def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>; -} +def LGBR : UnaryRRE<"lgbr", 0xB906, sext8, GR64, GR64>; +def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>; +def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>; + let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in def LTGFR : UnaryRRE<"ltgfr", 0xB912, null_frag, GR64, GR32>; @@ -620,23 +615,20 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in //===----------------------------------------------------------------------===// // 32-bit extensions from registers. -let hasSideEffects = 0 in { - // Expands to LLCR or RISB[LH]G, depending on the choice of registers. - def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>, - Requires<[FeatureHighWord]>; - def LLCR : UnaryRRE<"llcr", 0xB994, zext8, GR32, GR32>; - // Expands to LLHR or RISB[LH]G, depending on the choice of registers. - def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>, - Requires<[FeatureHighWord]>; - def LLHR : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>; -} + +// Expands to LLCR or RISB[LH]G, depending on the choice of registers. +def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>, + Requires<[FeatureHighWord]>; +def LLCR : UnaryRRE<"llcr", 0xB994, zext8, GR32, GR32>; +// Expands to LLHR or RISB[LH]G, depending on the choice of registers. +def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>, + Requires<[FeatureHighWord]>; +def LLHR : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>; // 64-bit extensions from registers. -let hasSideEffects = 0 in { - def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8, GR64, GR64>; - def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>; - def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>; -} +def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8, GR64, GR64>; +def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>; +def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>; // Match 32-to-64-bit zero extensions in which the source is already // in a 64-bit register. @@ -683,7 +675,7 @@ let Predicates = [FeatureLoadAndZeroRightmostByte] in { } // Load and trap.
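(Illustration of why the block below gains hasSideEffects = 1: the load-and-trap instructions raise a trap when the loaded value is zero, a behaviour that pattern-based property guessing no longer supplies now that guessInstructionProperties = 0.)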
-let Predicates = [FeatureLoadAndTrap] in { +let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in { def LLGFAT : UnaryRXY<"llgfat", 0xE39D, null_frag, GR64, 4>; def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>; } @@ -760,10 +752,8 @@ def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>; //===----------------------------------------------------------------------===// // Byte-swapping register moves. -let hasSideEffects = 0 in { - def LRVR : UnaryRRE<"lrvr", 0xB91F, bswap, GR32, GR32>; - def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>; -} +def LRVR : UnaryRRE<"lrvr", 0xB91F, bswap, GR32, GR32>; +def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>; // Byte-swapping loads. Unlike normal loads, these instructions are // allowed to access storage more than once. @@ -785,13 +775,12 @@ let mayLoad = 1, mayStore = 1 in //===----------------------------------------------------------------------===// // Load BDX-style addresses. -let hasSideEffects = 0, isAsCheapAsAMove = 1, isReMaterializable = 1 in +let isAsCheapAsAMove = 1, isReMaterializable = 1 in defm LA : LoadAddressRXPair<"la", 0x41, 0xE371, bitconvert>; // Load a PC-relative address. There's no version of this instruction // with a 16-bit offset, so there's no relaxation. -let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, - isReMaterializable = 1 in +let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in def LARL : LoadAddressRIL<"larl", 0xC00, bitconvert>; // Load the Global Offset Table address. This will be lowered into a @@ -1267,6 +1256,7 @@ def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; + def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), @@ -1279,6 +1269,7 @@ def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; + def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), @@ -1328,11 +1319,9 @@ def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))), //===----------------------------------------------------------------------===// // Logical shift left. -let hasSideEffects = 0 in { - defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; - def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; -} +defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; +def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; +def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; // Arithmetic shift left. let Defs = [CC] in { @@ -1342,11 +1331,9 @@ let Defs = [CC] in { } // Logical shift right. -let hasSideEffects = 0 in { - defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; - def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; - def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; -} +defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; +def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; +def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; // Arithmetic shift right. 
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { @@ -1356,10 +1343,8 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { } // Rotate left. -let hasSideEffects = 0 in { - def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; - def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; -} +def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; +def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; // Rotate second operand left and insert selected bits into first operand. // These can act like 32-bit operands provided that the constant start and @@ -1550,10 +1535,12 @@ let Defs = [CC] in { // Prefetch and execution hint //===----------------------------------------------------------------------===// -def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>; -def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; +let mayLoad = 1, mayStore = 1 in { + def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>; + def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; +} -let Predicates = [FeatureExecutionHint] in { +let Predicates = [FeatureExecutionHint], hasSideEffects = 1 in { // Branch Prediction Preload def BPP : BranchPreloadSMI<"bpp", 0xC7>; def BPRP : BranchPreloadMII<"bprp", 0xC5>; @@ -1820,7 +1807,10 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { // Guarded storage //===----------------------------------------------------------------------===// -let Predicates = [FeatureGuardedStorage] in { +// These instructions use and/or modify the guarded storage control +// registers, which we do not otherwise model, so they should have +// hasSideEffects. +let Predicates = [FeatureGuardedStorage], hasSideEffects = 1 in { def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; @@ -1896,7 +1886,7 @@ defm LAE : LoadAddressRXPair<"lae", 0x51, 0xE375, null_frag>; // Load access multiple. defm LAM : LoadMultipleRSPair<"lam", 0x9A, 0xEB9A, AR32>; -// Load access multiple. +// Store access multiple. defm STAM : StoreMultipleRSPair<"stam", 0x9B, 0xEB9B, AR32>; //===----------------------------------------------------------------------===// @@ -1945,7 +1935,6 @@ let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in { let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in { def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>; def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>; - def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561, int_s390_tbeginc, imm32zx16>; } @@ -1955,7 +1944,9 @@ let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in { def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>; // Transaction Abort - let isTerminator = 1, isBarrier = 1 in + // TODO: Shouldn't be mayLoad or mayStore.
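(TABORT unconditionally aborts the current transaction, so only hasSideEffects = 1 is strictly needed; per the TODO above, the mayLoad/mayStore added below are conservative placeholders.)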
+ let isTerminator = 1, isBarrier = 1, mayLoad = 1, mayStore = 1, + hasSideEffects = 1 in def TABORT : SideEffectAddressS<"tabort", 0xB2FC, int_s390_tabort>; // Nontransactional Store @@ -2031,7 +2022,7 @@ let hasSideEffects = 1 in { // .insn directive instructions //===----------------------------------------------------------------------===// -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, hasSideEffects = 1 in { def InsnE : DirectiveInsnE<(outs), (ins imm64zx16:$enc), ".insn e,$enc", []>; def InsnRI : DirectiveInsnRI<(outs), (ins imm64zx32:$enc, AnyReg:$R1, imm32sx16:$I2), diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index 0112ebf1eb10c..c351577fa5bd1 100644 --- a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -23,7 +23,7 @@ let hasSideEffects = 1, Uses = [CC] in def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; // Load PSW (extended). -let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in { +let hasSideEffects = 1, Defs = [CC] in { def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>; def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>; } @@ -37,7 +37,7 @@ let hasSideEffects = 1 in def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>; // Set system mask. -let hasSideEffects = 1, mayLoad = 1 in +let hasSideEffects = 1 in def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>; // Store then AND/OR system mask. @@ -60,13 +60,15 @@ let hasSideEffects = 1 in { // Control Register Instructions. //===----------------------------------------------------------------------===// -// Load control. -def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>; -def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>; +let hasSideEffects = 1 in { + // Load control. + def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>; + def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>; -// Store control. -def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>; -def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>; + // Store control. + def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>; + def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>; +} // Extract primary ASN (and instance). let hasSideEffects = 1 in { diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index c9a02d9c80821..92b86575235a9 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -56,8 +56,7 @@ def : VectorExtractSubreg; //===----------------------------------------------------------------------===// let Predicates = [FeatureVector] in { - let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, - isReMaterializable = 1 in { + let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in { // Generate byte mask. def VZERO : InherentVRIa<"vzero", 0xE744, 0>; @@ -141,8 +140,10 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX; - def VL64 : UnaryAliasVRX; + let mayLoad = 1 in { + def VL32 : UnaryAliasVRX; + def VL64 : UnaryAliasVRX; + } // Load logical element and zero. def VLLEZ : UnaryVRXGeneric<"vllez", 0xE704>; @@ -231,8 +232,10 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. 
-  def VST32 : StoreAliasVRX;
-  def VST64 : StoreAliasVRX;
+  let mayStore = 1 in {
+    def VST32 : StoreAliasVRX;
+    def VST64 : StoreAliasVRX;
+  }

   // Scatter element.
   def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 5f8f8ca9143d8..8787a90b1e259 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -51,6 +51,8 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
                             const VirtRegMap *VRM,
                             const LiveRegMatrix *Matrix) const override;

+  bool enableMultipleCopyHints() const override { return true; }
+
   // Override TargetRegisterInfo.h.
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index c82a64d58246e..2437b01768815 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -108,8 +108,8 @@ void WebAssemblyTargetAsmStreamer::emitGlobal(
   }
 }

-void WebAssemblyTargetAsmStreamer::emitStackPointer(uint32_t Index) {
-  OS << "\t.stack_pointer\t" << Index << '\n';
+void WebAssemblyTargetAsmStreamer::emitStackPointer(MCSymbol *Symbol) {
+  OS << "\t.stack_pointer\t" << Symbol->getName() << '\n';
 }

 void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
@@ -158,7 +158,7 @@ void WebAssemblyTargetELFStreamer::emitGlobal(
 }

 void WebAssemblyTargetELFStreamer::emitStackPointer(
-    uint32_t Index) {
+    MCSymbol *Symbol) {
   llvm_unreachable(".stack_pointer encoding not yet implemented");
 }

@@ -238,11 +238,11 @@ void WebAssemblyTargetWasmStreamer::emitGlobal(
   Streamer.PopSection();
 }

-void WebAssemblyTargetWasmStreamer::emitStackPointer(uint32_t Index) {
+void WebAssemblyTargetWasmStreamer::emitStackPointer(MCSymbol *Symbol) {
   Streamer.PushSection();
   Streamer.SwitchSection(Streamer.getContext().getWasmSection(
       ".stack_pointer", SectionKind::getMetadata()));
-  Streamer.EmitIntValue(Index, 4);
+  Streamer.EmitBytes(Symbol->getName());
   Streamer.PopSection();
 }

@@ -277,4 +277,5 @@ void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
 }

 void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
+  llvm_unreachable(".global_import is not needed for direct wasm output");
 }
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 102d7219a1e74..db908572a58d9 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -40,7 +40,7 @@ class WebAssemblyTargetStreamer : public MCTargetStreamer {
   /// .globalvar
   virtual void emitGlobal(ArrayRef Globals) = 0;
   /// .stack_pointer
-  virtual void emitStackPointer(uint32_t Index) = 0;
+  virtual void emitStackPointer(MCSymbol *Symbol) = 0;
   /// .endfunc
   virtual void emitEndFunc() = 0;
   /// .functype
@@ -67,7 +67,7 @@ class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
   void emitResult(MCSymbol *Symbol, ArrayRef Types) override;
   void emitLocal(ArrayRef Types) override;
   void emitGlobal(ArrayRef Globals) override;
-  void emitStackPointer(uint32_t Index) override;
+  void emitStackPointer(MCSymbol *Symbol) override;
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbol *Symbol,
                                 SmallVectorImpl &Params,
@@ -85,7 +85,7 @@ class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
   void emitResult(MCSymbol *Symbol, ArrayRef Types) override;
   void emitLocal(ArrayRef Types) override;
   void emitGlobal(ArrayRef Globals) override;
-  void emitStackPointer(uint32_t Index) override;
+  void emitStackPointer(MCSymbol *Symbol) override;
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbol *Symbol,
                                 SmallVectorImpl &Params,
@@ -103,7 +103,7 @@ class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer {
   void emitResult(MCSymbol *Symbol, ArrayRef Types) override;
   void emitLocal(ArrayRef Types) override;
   void emitGlobal(ArrayRef Globals) override;
-  void emitStackPointer(uint32_t Index) override;
+  void emitStackPointer(MCSymbol *Symbol) override;
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbol *Symbol,
                                 SmallVectorImpl &Params,
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index ee60c8f3a7a3b..1d606d49bed40 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -78,6 +78,10 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
 //===----------------------------------------------------------------------===//

 void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // Declare the stack pointer.
+  getTargetStreamer()->emitStackPointer(
+      GetExternalSymbolSymbol("__stack_pointer"));
+
   for (const auto &F : M) {
     // Emit function type info for all undefined functions
     if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
@@ -91,7 +95,8 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
   for (const auto &G : M.globals()) {
     if (!G.hasInitializer() && G.hasExternalLinkage()) {
       uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
-      getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
+      if (TM.getTargetTriple().isOSBinFormatELF())
+        getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
       OutStreamer->emitELFSize(getSymbol(&G),
                                MCConstantExpr::create(Size, OutContext));
     }
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index f205d3ebfbf7f..573b7caf29c0a 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -70,13 +70,13 @@ static RegDomain getDomain(const TargetRegisterClass *RC,
 static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
                                            RegDomain Domain) {
   assert(Domain == MaskDomain && "add domain");
-  if (SrcRC == &X86::GR8RegClass)
+  if (X86::GR8RegClass.hasSubClassEq(SrcRC))
     return &X86::VK8RegClass;
-  if (SrcRC == &X86::GR16RegClass)
+  if (X86::GR16RegClass.hasSubClassEq(SrcRC))
     return &X86::VK16RegClass;
-  if (SrcRC == &X86::GR32RegClass)
+  if (X86::GR32RegClass.hasSubClassEq(SrcRC))
     return &X86::VK32RegClass;
-  if (SrcRC == &X86::GR64RegClass)
+  if (X86::GR64RegClass.hasSubClassEq(SrcRC))
     return &X86::VK64RegClass;
   llvm_unreachable("add register class");
   return nullptr;
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 073b4e01377aa..5f013753ea827 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1687,6 +1687,19 @@ bool X86TargetLowering::useLoadStackGuardNode() const {
   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
 }

+bool X86TargetLowering::useStackGuardXorFP() const {
+  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
+  return Subtarget.getTargetTriple().isOSMSVCRT();
+}
+
+SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+                                               const SDLoc &DL) const {
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
+  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
+  return SDValue(Node, 0);
+}
+
 TargetLoweringBase::LegalizeTypeAction
 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
   if (ExperimentalVectorWideningLegalization &&
@@ -8318,6 +8331,11 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
          V1.getValueType().getVectorNumElements() == NumElems/2 &&
          "Unexpected operands in CONCAT_VECTORS");

+  // If this can be done with a subreg insert do that first.
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+  if (V2.isUndef())
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+
   if (ResVT.getSizeInBits() >= 16)
     return Op; // The operation is legal with KUNPCK

@@ -8327,9 +8345,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   if (IsZeroV1 && IsZeroV2)
     return ZeroVec;

-  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
-  if (V2.isUndef())
-    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);

   if (IsZeroV2)
     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

@@ -16086,9 +16101,6 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   MVT InVT = In.getSimpleValueType();
   SDLoc dl(Op);

-  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
-    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
-
   // Optimize vectors in AVX mode:
   //
   //   v8i16 -> v8i32
@@ -16140,24 +16152,51 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   if (InVT.getVectorElementType() != MVT::i1)
     return SDValue();

-  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
+  // Extend VT if the scalar type is v8/v16 and BWI is not supported.
   MVT ExtVT = VT;
-  if (!VT.is512BitVector() && !Subtarget.hasVLX())
-    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+  if (!Subtarget.hasBWI() &&
+      (VT.getVectorElementType().getSizeInBits() <= 16))
+    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

-  SDValue One =
-      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
-  SDValue Zero =
-      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+  // Widen to 512-bits if VLX is not supported.
+  MVT WideVT = ExtVT;
+  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+    NumElts *= 512 / ExtVT.getSizeInBits();
+    InVT = MVT::getVectorVT(MVT::i1, NumElts);
+    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
+                     In, DAG.getIntPtrConstant(0, DL));
+    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
+                              NumElts);
+  }
+
+  SDValue One = DAG.getConstant(1, DL, WideVT);
+  SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
+
+  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
+
+  // Truncate if we had to extend i16/i8 above.
+  if (VT != ExtVT) {
+    WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+    SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal);
+  }
+
+  // Extract back to 128/256-bit if we widened.
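  // A worked example of the path above (an illustration, not text from the
  // patch), assuming an AVX512F-only target (no BWI, no VLX) and a
  // v8i1 -> v8i16 zero extend:
  //   ExtVT  = v8i32  (i16 elements need i32 without BWI)
  //   widen:  NumElts = 8 * (512 / 256) = 16, In widened to v16i1,
  //           WideVT = v16i32
  //   select: v16i32 of 1/0
  //   VT != ExtVT, so VTRUNC to v16i16
  //   WideVT (v16i16) != VT (v8i16), so the extract below returns the
  //   low v8i16 subvector.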
+  if (WideVT != VT)
+    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
+                              DAG.getIntPtrConstant(0, DL));

-  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
-  if (VT == ExtVT)
-    return SelectedVal;
-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
+  return SelectedVal;
 }

 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Op), VT, In);
+
   if (Subtarget.hasFp256())
     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
       return Res;
@@ -16167,7 +16206,6 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,

 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
-  SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
   MVT SVT = In.getSimpleValueType();
@@ -18268,14 +18306,6 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   MVT InVTElt = InVT.getVectorElementType();
   SDLoc dl(Op);

-  // SKX processor
-  if ((InVTElt == MVT::i1) &&
-      (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
-
-       ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
-
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-
   unsigned NumElts = VT.getVectorNumElements();

   if (VT.is512BitVector() && InVTElt != MVT::i1 &&
@@ -18288,28 +18318,44 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   if (InVTElt != MVT::i1)
     return SDValue();

+  // Extend VT if the scalar type is v8/v16 and BWI is not supported.
   MVT ExtVT = VT;
-  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
-    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
-  } else if (VTElt == MVT::i16 || VTElt == MVT::i8) {
-    // If we don't have BWI support we need to extend 8/16-bit to 32-bit.
-    // Otherwise we end up with vselects we can't handle.
+  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+
+  // Widen to 512-bits if VLX is not supported.
+  MVT WideVT = ExtVT;
+  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+    NumElts *= 512 / ExtVT.getSizeInBits();
+    InVT = MVT::getVectorVT(MVT::i1, NumElts);
+    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
+                     In, DAG.getIntPtrConstant(0, dl));
+    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
   }

   SDValue V;
-  if (Subtarget.hasDQI()) {
-    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
-    assert(!VT.is512BitVector() && "Unexpected vector type");
+  MVT WideEltVT = WideVT.getVectorElementType();
+  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
+      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
+    V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
   } else {
-    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
-    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
-    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
-    if (ExtVT == VT)
-      return V;
   }

-  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
+    SDValue NegOne = getOnesVector(WideVT, DAG, dl);
+    SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
+    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
+  // Truncate if we had to extend i16/i8 above.
+  if (VT != ExtVT) {
+    WideVT = MVT::getVectorVT(VTElt, NumElts);
+    V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
+  }
+
+  // Extract back to 128/256-bit if we widened.
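  // (A summary, not patch text: this sign-extend path now mirrors the
  // zero-extend lowering above. The same ExtVT/WideVT widening applies,
  // but the select materializes all-ones (NegOne) instead of 1, and when
  // DQI covers the >= i32 element cases or BWI covers the <= i16 cases
  // the mask is sign-extended directly with VSEXT rather than through a
  // select.)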
+ if (WideVT != VT) + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, + DAG.getIntPtrConstant(0, dl)); + + return V; } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. @@ -23770,12 +23816,13 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (Subtarget.hasXOP()) + MVT VT = Op.getSimpleValueType(); + + if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); - MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); SDLoc DL(Op); @@ -29973,6 +30020,53 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); + if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && + N0->getOpcode() == ISD::OR) { + SDValue Op0 = N0->getOperand(0); + SDValue Op1 = N0->getOperand(1); + MVT TrunckVT; + MVT BitcastVT; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v16i1: + TrunckVT = MVT::i8; + BitcastVT = MVT::v8i1; + break; + case MVT::v32i1: + TrunckVT = MVT::i16; + BitcastVT = MVT::v16i1; + break; + case MVT::v64i1: + TrunckVT = MVT::i32; + BitcastVT = MVT::v32i1; + break; + } + bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; + bool isArg0UndefLeft = + Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; + bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; + bool isArg1UndefLeft = + Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; + SDValue OpLeft; + SDValue OpRight; + if (isArg0UndefRight && isArg1UndefLeft) { + OpLeft = Op0; + OpRight = Op1; + } else if (isArg1UndefRight && isArg0UndefLeft) { + OpLeft = Op1; + OpRight = Op0; + } else + return SDValue(); + SDLoc DL(BitCast); + SDValue Shr = OpLeft->getOperand(0); + SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); + SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); + SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); + SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); + } + if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 90830f4d5d110..d31104f943346 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1055,9 +1055,13 @@ namespace llvm { Value *getIRStackGuard(IRBuilder<> &IRB) const override; bool useLoadStackGuardNode() const override; + bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; Value *getSSPStackGuardCheck(const Module &M) const override; + SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, + const SDLoc &DL) const override; + /// Return true if the target stores SafeStack pointer at a fixed offset in /// some non-standard address space, and populates the address space and diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 7a4ba0eae733c..ceba86a583d6d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -311,7 +311,7 @@ multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, - InstrItinClass itin = 
NoItinerary, + InstrItinClass itin, bit IsCommutable = 0> : AVX512_maskable; @@ -340,7 +340,7 @@ multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_3src_scalar O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, InstrItinClass itin = NoItinerary, + dag RHS, InstrItinClass itin, bit IsCommutable = 0, bit IsKCommutable = 0, bit MaskOnly = 0> : @@ -353,7 +353,7 @@ multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list Pattern, - InstrItinClass itin = NoItinerary> : + InstrItinClass itin> : AVX512_maskable_custom O, Format F, string AttSrcAsm, string IntelSrcAsm, list Pattern, list MaskingPattern, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512 O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0> : AVX512_maskable_custom_cmp O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_cmp O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, InstrItinClass itin = NoItinerary, + dag RHS, InstrItinClass itin, bit IsCommutable = 0> : AVX512_maskable_common_cmp O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_cmp_alt O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - InstrItinClass itin = NoItinerary> : + InstrItinClass itin> : AVX512_maskable_custom_cmp; @@ -422,7 +422,7 @@ multiclass AVX512_maskable_logic O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskedRHS, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0, SDNode Select = vselect> : AVX512_maskable_custom opc, string OpcodeStr, X86VectorVTInfo _> { + +let Sched = WriteFVarBlend in +def AVX512_BLENDM : OpndItins< + IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM +>; + +let Sched = WriteVarBlend in +def AVX512_PBLENDM : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +multiclass avx512_blendmask opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { def rr : AVX5128I, EVEX_4V; + [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rrk : AVX5128I, EVEX_4V, EVEX_K; + [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rrkz : AVX5128I, EVEX_4V, EVEX_KZ; + [], itins.rr>, EVEX_4V, EVEX_KZ, Sched<[itins.Sched]>; let mayLoad = 1 in { def rm : AVX5128I, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmk : AVX5128I, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmkz : AVX5128I, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } } -multiclass avx512_blendmask_rmb opc, string OpcodeStr, X86VectorVTInfo _> { - +multiclass avx512_blendmask_rmb opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _> { let mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, 
EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmb : AVX5128I, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass blendmask_dq opc, string OpcodeStr, +multiclass blendmask_dq opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo> { - defm Z : avx512_blendmask , - avx512_blendmask_rmb , EVEX_V512; + defm Z : avx512_blendmask , + avx512_blendmask_rmb , EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_blendmask, - avx512_blendmask_rmb , EVEX_V256; - defm Z128 : avx512_blendmask, - avx512_blendmask_rmb , EVEX_V128; + defm Z256 : avx512_blendmask, + avx512_blendmask_rmb, EVEX_V256; + defm Z128 : avx512_blendmask, + avx512_blendmask_rmb, EVEX_V128; } } -multiclass blendmask_bw opc, string OpcodeStr, +multiclass blendmask_bw opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasBWI] in - defm Z : avx512_blendmask , EVEX_V512; + defm Z : avx512_blendmask, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm Z256 : avx512_blendmask , EVEX_V256; - defm Z128 : avx512_blendmask , EVEX_V128; + defm Z256 : avx512_blendmask, EVEX_V256; + defm Z128 : avx512_blendmask, EVEX_V128; } } -defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; -defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; -defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; -defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; -defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; -defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", AVX512_BLENDM, avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", AVX512_BLENDM, avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", AVX512_PBLENDM, avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", AVX512_PBLENDM, avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", AVX512_PBLENDM, avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_info>, VEX_W; //===----------------------------------------------------------------------===// @@ -1869,8 +1886,8 @@ defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar{ - +multiclass avx512_cmp_scalar { defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), @@ -1878,7 +1895,7 @@ multiclass avx512_cmp_scalar "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V; + imm:$cc), itins.rr>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -1886,7 +1903,8 @@ multiclass avx512_cmp_scalar "vcmp${cc}"#_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + imm:$cc), itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), @@ -1896,28 +1914,31 @@ multiclass avx512_cmp_scalar (OpNodeRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, - (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + (i32 
FROUND_NO_EXC)), itins.rr>, + EVEX_4V, EVEX_B, Sched<[itins.Sched]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, (outs VK1:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, EVEX_4V, + Sched<[itins.Sched]>; let mayLoad = 1 in defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, - EVEX_4V, EVEX_B; + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", itins.rr>, + EVEX_4V, EVEX_B, Sched<[itins.Sched]>; }// let isAsmParserOnly = 1, hasSideEffects = 0 let isCodeGenOnly = 1 in { @@ -1929,7 +1950,7 @@ multiclass avx512_cmp_scalar [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, imm:$cc))], - IIC_SSE_ALU_F32S_RR>, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), @@ -1938,33 +1959,34 @@ multiclass avx512_cmp_scalar [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let Predicates = [HasAVX512] in { let ExeDomain = SSEPackedSingle in - defm VCMPSSZ : avx512_cmp_scalar, - AVX512XSIi8Base; + defm VCMPSSZ : avx512_cmp_scalar, AVX512XSIi8Base; let ExeDomain = SSEPackedDouble in - defm VCMPSDZ : avx512_cmp_scalar, - AVX512XDIi8Base, VEX_W; + defm VCMPSDZ : avx512_cmp_scalar, AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable> { + OpndItins itins, X86VectorVTInfo _, bit IsCommutable> { let isCommutable = IsCommutable in def rr : AVX512BI, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rm : AVX512BI, EVEX_4V; + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCommutable = IsCommutable in def rrk : AVX512BI opc, string OpcodeStr, SDNode OpNode, "$dst {${mask}}, $src1, $src2}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rmk : AVX512BI opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + itins.rm>, EVEX_4V, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed { + OpndItins itins, X86VectorVTInfo _, bit IsCommutable> : + avx512_icmp_packed { def rmb : AVX512BI, EVEX_4V, EVEX_B; + itins.rm>, EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmbk : AVX512BI opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag 
addr:$src2)))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + itins.rm>, EVEX_4V, EVEX_K, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_icmp_packed_vl opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd, - bit IsCommutable = 0> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed, EVEX_V256; - defm Z128 : avx512_icmp_packed, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo, - Predicate prd, bit IsCommutable = 0> { + SDNode OpNode, OpndItins itins, + AVX512VLVectorVTInfo VTInfo, + Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb, EVEX_V128; } } +// FIXME: Is there a better scheduler itinerary for VPCMP? defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, - avx512vl_i8_info, HasBWI, 1>, + SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, - avx512vl_i16_info, HasBWI, 1>, + SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, - avx512vl_i32_info, HasAVX512, 1>, + SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, - avx512vl_i64_info, HasAVX512, 1>, + SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, - avx512vl_i8_info, HasBWI>, + SSE_ALU_F32P, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, - avx512vl_i16_info, HasBWI>, + SSE_ALU_F32P, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, - avx512vl_i32_info, HasAVX512>, + SSE_ALU_F32P, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, - avx512vl_i64_info, HasAVX512>, + SSE_ALU_F32P, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; // Transforms to swizzle an immediate to help matching memory operand in first @@ -2089,7 +2114,7 @@ def CommutePCMPCC : SDNodeXForm; multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let isCommutable = 1 in def rri : AVX512AIi8 opc, string Suffix, SDNode OpNode, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rmi : AVX512AIi8 opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V; + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCommutable = 1 in def rrik : AVX512AIi8 opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc)))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + itins.rr>, 
EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rmik : AVX512AIi8 opc, string Suffix, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + itins.rm>, EVEX_4V, EVEX_K, + Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { @@ -2135,20 +2161,20 @@ multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in def rmi_alt : AVX512AIi8, EVEX_4V; + [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rrik_alt : AVX512AIi8, EVEX_4V, EVEX_K; + [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; let mayLoad = 1 in def rmik_alt : AVX512AIi8 opc, string Suffix, SDNode OpNode, !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + [], itins.rm>, EVEX_4V, EVEX_K, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)), @@ -2172,8 +2199,8 @@ multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, } multiclass avx512_icmp_cc_rmb opc, string Suffix, SDNode OpNode, - X86VectorVTInfo _> : - avx512_icmp_cc { + OpndItins itins, X86VectorVTInfo _> : + avx512_icmp_cc { def rmib : AVX512AIi8 opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + itins.rm>, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmibk : AVX512AIi8 opc, string Suffix, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc)))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + itins.rm>, EVEX_4V, EVEX_K, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
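  // For example (an illustration, not from the patch): with these _alt
  // forms the assembler accepts the raw-immediate spelling alongside the
  // canonical mnemonic,
  //   vpcmpled %zmm1, %zmm0, %k0
  //   vpcmpd   $2, %zmm1, %zmm0, %k0   // 2 = LE in the VPCMP encoding
  // and both assemble to the same instruction.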
  let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@@ -2204,14 +2233,16 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, SDNode OpNode,
                    !strconcat("vpcmp", Suffix,
                        "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
                        "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
-                   [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+                   [], itins.rm>, EVEX_4V, EVEX_B,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
     def rmibk_alt : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B;
+                    [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+                    Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }

   def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
@@ -2228,49 +2259,61 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, SDNode OpNode,
 }

 multiclass avx512_icmp_cc_vl opc, string Suffix, SDNode OpNode,
-                             AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+                             OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+                             Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc, EVEX_V512;
+  defm Z : avx512_icmp_cc,
+           EVEX_V512;

   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc, EVEX_V256;
-    defm Z128 : avx512_icmp_cc, EVEX_V128;
+    defm Z256 : avx512_icmp_cc,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_cc,
+                EVEX_V128;
   }
 }

 multiclass avx512_icmp_cc_rmb_vl opc, string Suffix, SDNode OpNode,
-                                 AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+                                 OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+                                 Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc_rmb,
+  defm Z : avx512_icmp_cc_rmb,
            EVEX_V512;

   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc_rmb,
+    defm Z256 : avx512_icmp_cc_rmb,
                 EVEX_V256;
-    defm Z128 : avx512_icmp_cc_rmb,
+    defm Z128 : avx512_icmp_cc_rmb,
                 EVEX_V128;
   }
 }

-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
-               HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
-               HasBWI>, EVEX_CD8<8, CD8VF>;
+// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
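// For reference (paraphrased from X86InstrSSE.td, not part of this hunk):
// an OpndItins bundles a register-form and a memory-form itinerary plus a
// SchedWrite, roughly
//   let Sched = WriteFAdd in
//   def SSE_ALU_F32P : OpndItins<IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM>;
// so reusing it below gives VPCMP/VPCMPU the FP-add timing until a better
// itinerary exists, which is what the FIXME records.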
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
+                                avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
+                                 avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;

-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
-               HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
-               HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
+                                avx512vl_i16_info, HasBWI>,
+                                VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
+                                 avx512vl_i16_info, HasBWI>,
+                                 VEX_W, EVEX_CD8<16, CD8VF>;

-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
-               HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
-               HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
+                                    avx512vl_i32_info, HasAVX512>,
+                                    EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
+                                     avx512vl_i32_info, HasAVX512>,
+                                     EVEX_CD8<32, CD8VF>;

-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
-               HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
-               HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
+                                    avx512vl_i64_info, HasAVX512>,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
+                                     avx512vl_i64_info, HasAVX512>,
+                                     VEX_W, EVEX_CD8<64, CD8VF>;

 multiclass avx512_vcmp_common {
@@ -4576,7 +4619,7 @@ let Predicates = [HasAVX512] in {
 // be set to null_frag for 32-bit elements.
 multiclass avx512_logic_rm opc, string OpcodeStr,
                            SDPatternOperator OpNode,
-                           SDNode OpNodeMsk, X86VectorVTInfo _,
+                           SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
                            bit IsCommutable = 0> {
   let hasSideEffects = 0 in
   defm rr : AVX512_maskable_logic
                            (bitconvert (_.VT _.RC:$src2)))),
                     (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
                                                 _.RC:$src2)))),
-                    IIC_SSE_BIT_P_RR, IsCommutable>,
-                    AVX512BIBase, EVEX_4V;
+                    itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
+                    Sched<[itins.Sched]>;

   let hasSideEffects = 0, mayLoad = 1 in
   defm rm : AVX512_maskable_logic
                            (bitconvert (_.LdFrag addr:$src2)))),
                     (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
                                             (bitconvert (_.LdFrag addr:$src2)))))),
-                    IIC_SSE_BIT_P_RM>,
-                    AVX512BIBase, EVEX_4V;
+                    itins.rm>, AVX512BIBase, EVEX_4V,
+                    Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }

 // OpNodeMsk is the OpNode to use where element size is important. So use
 // for all of the broadcast patterns.
 multiclass avx512_logic_rmb opc, string OpcodeStr,
                             SDPatternOperator OpNode,
-                            SDNode OpNodeMsk, X86VectorVTInfo _,
+                            SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
                             bit IsCommutable = 0> :
-       avx512_logic_rm {
+       avx512_logic_rm {
   defm rmb : AVX512_maskable_logic
                             (bitconvert
                              (_.VT (X86VBroadcast
                                     (_.ScalarLdFrag addr:$src2)))))))),
-                     IIC_SSE_BIT_P_RM>,
-                     AVX512BIBase, EVEX_4V, EVEX_B;
+                     itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
+                     Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }

 multiclass avx512_logic_rmb_vl opc, string OpcodeStr,
                                SDPatternOperator OpNode,
-                               SDNode OpNodeMsk, AVX512VLVectorVTInfo VTInfo,
+                               SDNode OpNodeMsk, OpndItins itins,
+                               AVX512VLVectorVTInfo VTInfo,
                                bit IsCommutable = 0> {
   let Predicates = [HasAVX512] in
-    defm Z : avx512_logic_rmb, EVEX_V512;
+    defm Z : avx512_logic_rmb, EVEX_V512;

   let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_logic_rmb, EVEX_V256;
-    defm Z128 : avx512_logic_rmb, EVEX_V128;
   }
 }

 multiclass avx512_logic_rm_vl_dq opc_d, bits<8> opc_q, string OpcodeStr,
-                                 SDNode OpNode, bit IsCommutable = 0> {
-  defm Q : avx512_logic_rmb_vl {
+  defm Q : avx512_logic_rmb_vl,
                                VEX_W, EVEX_CD8<64, CD8VF>;
-  defm D : avx512_logic_rmb_vl, EVEX_CD8<32, CD8VF>;
 }

-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_BIT_ITINS_P, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_BIT_ITINS_P, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_BIT_ITINS_P, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_BIT_ITINS_P>;

 //===----------------------------------------------------------------------===//
 // AVX-512  FP arithmetic
@@ -4741,7 +4787,8 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _,
                             (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                             "{sae}, $src2, $src1", "$src1, $src2, {sae}",
                             (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                            (i32 FROUND_NO_EXC))>, EVEX_B, Sched<[itins.Sched]>;
+                            (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B,
+                            Sched<[itins.Sched]>;
   }
 }
@@ -6259,21 +6306,21 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
 //===----------------------------------------------------------------------===//
 let Constraints = "$src1 = $dst" in {
 multiclass avx512_pmadd52_rm opc, string OpcodeStr, SDNode OpNode,
-                             X86VectorVTInfo _> {
+                             OpndItins itins, X86VectorVTInfo _> {
   // NOTE: The SDNode has the multiply operands first with the add last.
   // This enables commuted load patterns to be autogenerated by tablegen.
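  // As an illustration of the underlying operation (semantics per the IFMA
  // description, not text from this patch): per 64-bit lane, vpmadd52luq
  // computes Acc + ((Src2[51:0] * Src3[51:0]) mod 2^52), i.e. in scalar
  // C++ roughly:
  //   uint64_t Mask52 = (1ULL << 52) - 1;
  //   unsigned __int128 P =
  //       (unsigned __int128)(Src2 & Mask52) * (Src3 & Mask52);
  //   Result = Acc + ((uint64_t)P & Mask52);
  // The multiply operands come first and the accumulator last, matching
  // the SDNode operand order described above.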
   let ExeDomain = _.ExeDomain in {
   defm r: AVX512_maskable_3src,
-          AVX512FMA3Base;
+          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), itins.rr, 1, 1>,
+          AVX512FMA3Base, Sched<[itins.Sched]>;

   defm m: AVX512_maskable_3src,
-          AVX512FMA3Base;
+          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+          itins.rm>, AVX512FMA3Base, Sched<[itins.Sched.Folded, ReadAfterLd]>;

   defm mb: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode,
            !strconcat("$src2, ${src3}", _.BroadcastStr ),
            (OpNode _.RC:$src2,
                    (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
-                   _.RC:$src1)>,
-           AVX512FMA3Base, EVEX_B;
+                   _.RC:$src1), itins.rm>,
+           AVX512FMA3Base, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 } // Constraints = "$src1 = $dst"

 multiclass avx512_pmadd52_common opc, string OpcodeStr, SDNode OpNode,
-                                 AVX512VLVectorVTInfo _> {
+                                 OpndItins itins, AVX512VLVectorVTInfo _> {
   let Predicates = [HasIFMA] in {
-    defm Z : avx512_pmadd52_rm,
+    defm Z : avx512_pmadd52_rm,
                       EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
   }
   let Predicates = [HasVLX, HasIFMA] in {
-    defm Z256 : avx512_pmadd52_rm,
+    defm Z256 : avx512_pmadd52_rm,
                       EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
-    defm Z128 : avx512_pmadd52_rm,
+    defm Z128 : avx512_pmadd52_rm,
                       EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
   }
 }

 defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
-                                         avx512vl_i64_info>, VEX_W;
+                                         SSE_PMADD, avx512vl_i64_info>, VEX_W;
 defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
-                                         avx512vl_i64_info>, VEX_W;
+                                         SSE_PMADD, avx512vl_i64_info>, VEX_W;

 //===----------------------------------------------------------------------===//
 // AVX-512  Scalar convert from sign integer to float/double
 //===----------------------------------------------------------------------===//

-multiclass avx512_vcvtsi opc, SDNode OpNode, RegisterClass SrcRC,
-                         X86VectorVTInfo DstVT, X86MemOperand x86memop,
-                         PatFrag ld_frag, string asm> {
+multiclass avx512_vcvtsi opc, SDNode OpNode, OpndItins itins,
+                         RegisterClass SrcRC, X86VectorVTInfo DstVT,
+                         X86MemOperand x86memop, PatFrag ld_frag, string asm> {
   let hasSideEffects = 0 in {
     def rr : SI,
-             EVEX_4V;
+             !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
+             itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
     let mayLoad = 1 in
       def rm : SI,
-               EVEX_4V;
+               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
+               itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   } // hasSideEffects = 0
   let isCodeGenOnly = 1 in {
     def rr_Int : SI opc, SDNode OpNode, RegisterClass SrcRC,
                  [(set DstVT.RC:$dst,
                        (OpNode (DstVT.VT DstVT.RC:$src1),
                                 SrcRC:$src2,
-                                (i32 FROUND_CURRENT)))]>, EVEX_4V;
+                                (i32 FROUND_CURRENT)))], itins.rr>,
+                 EVEX_4V, Sched<[itins.Sched]>;

     def rm_Int : SI opc, SDNode OpNode, RegisterClass SrcRC,
                  [(set DstVT.RC:$dst,
                        (OpNode (DstVT.VT DstVT.RC:$src1),
                                 (ld_frag addr:$src2),
-                                (i32 FROUND_CURRENT)))]>, EVEX_4V;
+                                (i32 FROUND_CURRENT)))], itins.rm>,
+                 EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }//isCodeGenOnly = 1
 }

-multiclass avx512_vcvtsi_round opc, SDNode OpNode, RegisterClass SrcRC,
-                               X86VectorVTInfo DstVT, string asm> {
+multiclass avx512_vcvtsi_round opc, SDNode OpNode, OpndItins itins,
+                               RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm> {
   def rrb_Int : SI opc, SDNode OpNode, RegisterClass SrcRC,
                 [(set DstVT.RC:$dst,
                       (OpNode (DstVT.VT DstVT.RC:$src1),
                                SrcRC:$src2,
-                               (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
+                               (i32 imm:$rc)))], itins.rr>,
+                EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
 }

-multiclass avx512_vcvtsi_common opc, SDNode OpNode, RegisterClass SrcRC,
-                                X86VectorVTInfo DstVT, X86MemOperand x86memop,
-                                PatFrag ld_frag, string asm> {
-  defm NAME : avx512_vcvtsi_round,
-              avx512_vcvtsi,
-              VEX_LIG;
+multiclass avx512_vcvtsi_common opc, SDNode OpNode, OpndItins itins,
+                                RegisterClass SrcRC, X86VectorVTInfo DstVT,
+                                X86MemOperand x86memop, PatFrag ld_frag, string asm> {
+  defm NAME : avx512_vcvtsi_round,
+              avx512_vcvtsi, VEX_LIG;
 }

 let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+defm VCVTSI2SSZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR32,
                                  v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
                                  XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR64,
                                  v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
                                  XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+defm VCVTSI2SDZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR32,
                                  v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
                                  XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR64,
                                  v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
                                  XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -6400,16 +6450,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
 def : Pat<(f64 (sint_to_fp GR64:$src)),
           (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

-defm VCVTUSI2SSZ   : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
+defm VCVTUSI2SSZ   : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR32,
                                   v4f32x_info, i32mem, loadi32,
                                   "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR64,
                                   v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
                                   XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
+defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info,
                                   i32mem, loadi32, "cvtusi2sd{l}">,
                                   XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR64,
                                   v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -6440,51 +6490,54 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
 //===----------------------------------------------------------------------===//
 // AVX-512  Scalar convert from float/double to integer
 //===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int_round opc, X86VectorVTInfo SrcVT ,
-                                  X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
+
+multiclass avx512_cvt_s_int_round opc, X86VectorVTInfo SrcVT,
+                                  X86VectorVTInfo DstVT, SDNode OpNode,
+                                  OpndItins itins, string asm> {
   let Predicates = [HasAVX512] in {
     def rr : SI,
-             EVEX, VEX_LIG;
+             [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))],
+             itins.rr>, EVEX, VEX_LIG, Sched<[itins.Sched]>;
    def rb : SI,
-             EVEX, VEX_LIG, EVEX_B, EVEX_RC;
+             [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))],
+             itins.rr>, EVEX, VEX_LIG, EVEX_B, EVEX_RC,
+             Sched<[itins.Sched]>;
     def rm : SI,
-             EVEX, VEX_LIG;
+                      (i32 FROUND_CURRENT)))], itins.rm>,
+             EVEX, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   } // Predicates = [HasAVX512]
 }

 // Convert float/double to signed/unsigned int 32/64
 defm VCVTSS2SIZ:    avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
-                                   X86cvts2si, "cvtss2si">,
+                                   X86cvts2si, SSE_CVT_SS2SI_32, "cvtss2si">,
                                    XS, EVEX_CD8<32, CD8VT1>;
 defm VCVTSS2SI64Z:  avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
-                                   X86cvts2si, "cvtss2si">,
+                                   X86cvts2si, SSE_CVT_SS2SI_64, "cvtss2si">,
                                    XS, VEX_W, EVEX_CD8<32, CD8VT1>;
 defm VCVTSS2USIZ:   avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
-                                   X86cvts2usi, "cvtss2usi">,
+                                   X86cvts2usi, SSE_CVT_SS2SI_32, "cvtss2usi">,
                                    XS, EVEX_CD8<32, CD8VT1>;
 defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
-                                   X86cvts2usi, "cvtss2usi">, XS, VEX_W,
-                                   EVEX_CD8<32, CD8VT1>;
+                                   X86cvts2usi, SSE_CVT_SS2SI_64, "cvtss2usi">,
+                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
 defm VCVTSD2SIZ:    avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
-                                   X86cvts2si, "cvtsd2si">,
+                                   X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
                                    XD, EVEX_CD8<64, CD8VT1>;
 defm VCVTSD2SI64Z:  avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
-                                   X86cvts2si, "cvtsd2si">,
+                                   X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
                                    XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 defm VCVTSD2USIZ:   avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
-                                   X86cvts2usi, "cvtsd2usi">,
+                                   X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
                                    XD, EVEX_CD8<64, CD8VT1>;
 defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
-                                   X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
-                                   EVEX_CD8<64, CD8VT1>;
+                                   X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;

 // The SSE version of these instructions are disabled for AVX512.
 // Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
@@ -6557,19 +6610,20 @@ def : Pat<(v2f64 (X86Movsd

 // Convert float/double to signed/unsigned int 32/64 with truncation
 multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC,
                             X86VectorVTInfo _DstRC, SDNode OpNode,
-                            SDNode OpNodeRnd, string aliasStr>{
+                            SDNode OpNodeRnd, OpndItins itins, string aliasStr>{
 let Predicates = [HasAVX512] in {
   def rr : AVX512,
-           EVEX;
+           [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))], itins.rr>,
+           EVEX, Sched<[itins.Sched]>;
   let hasSideEffects = 0 in
   def rb : AVX512,
-           EVEX, EVEX_B;
+           [], itins.rr>, EVEX, EVEX_B, Sched<[itins.Sched]>;
   def rm : AVX512,
-           EVEX;
+           [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))],
+           itins.rm>, EVEX, Sched<[itins.Sched.Folded, ReadAfterLd]>;

   def : InstAlias(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
@@ -6583,47 +6637,48 @@ let Predicates = [HasAVX512] in {
   def rr_Int : AVX512,
-               EVEX, VEX_LIG;
+                          (i32 FROUND_CURRENT)))], itins.rr>,
+               EVEX, VEX_LIG, Sched<[itins.Sched]>;
   def rb_Int : AVX512,
-               EVEX,VEX_LIG , EVEX_B;
+                          (i32 FROUND_NO_EXC)))], itins.rr>,
+               EVEX,VEX_LIG , EVEX_B, Sched<[itins.Sched]>;
   let mayLoad = 1, hasSideEffects = 0 in
   def rm_Int : AVX512,
-               EVEX, VEX_LIG;
-
+               [], itins.rm>, EVEX, VEX_LIG,
+               Sched<[itins.Sched.Folded, ReadAfterLd]>;
 } // isCodeGenOnly = 1
 } //HasAVX512
 }

 defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
-                        fp_to_sint, X86cvtts2IntRnd, "{l}">,
+                        fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_32, "{l}">,
                         XS, EVEX_CD8<32, CD8VT1>;
 defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
-                        fp_to_sint, X86cvtts2IntRnd, "{q}">,
+                        fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_64, "{q}">,
                         VEX_W, XS, EVEX_CD8<32, CD8VT1>;
 defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
-                        fp_to_sint, X86cvtts2IntRnd, "{l}">,
+                        fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{l}">,
                         XD, EVEX_CD8<64, CD8VT1>;
 defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
-                        fp_to_sint, X86cvtts2IntRnd, "{q}">,
+                        fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{q}">,
                         VEX_W, XD, EVEX_CD8<64, CD8VT1>;
 defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
-                        fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+                        fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_32, "{l}">,
                         XS, EVEX_CD8<32, CD8VT1>;
 defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
-                        fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+                        fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_64, "{q}">,
                         XS,VEX_W, EVEX_CD8<32, CD8VT1>;
 defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
-                        fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+                        fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{l}">,
                         XD, EVEX_CD8<64, CD8VT1>;
 defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
-                        fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+                        fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{q}">,
                         XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 let Predicates = [HasAVX512] in {
   def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
@@ -6643,87 +6698,92 @@ let Predicates = [HasAVX512] in {
   def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
             (VCVTTSD2SI64Zrm_Int sdmem:$src)>;
 } // HasAVX512
+
 //===----------------------------------------------------------------------===//
 // AVX-512  Convert form float to double and back
 //===----------------------------------------------------------------------===//
+
 multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _,
-                                X86VectorVTInfo _Src, SDNode OpNode> {
+                                X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins> {
   defm rr_Int : AVX512_maskable_scalar,
-                EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+                (i32 FROUND_CURRENT))), itins.rr>,
+                EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
   defm rm_Int : AVX512_maskable_scalar,
-                EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+                (i32 FROUND_CURRENT))), itins.rm>,
+                EVEX_4V, VEX_LIG,
+                Sched<[itins.Sched.Folded, ReadAfterLd]>;

   let isCodeGenOnly = 1, hasSideEffects = 0 in {
     def rr : I,
-             EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+             OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+             itins.rr>, EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
     let mayLoad = 1 in
     def rm : I,
-             EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+             OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+             itins.rm>, EVEX_4V, VEX_LIG,
+             Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }

 // Scalar Conversion with SAE - suppress all exceptions
 multiclass avx512_cvt_fp_sae_scalar opc, string OpcodeStr, X86VectorVTInfo _,
-                                    X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+                                    X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
   defm rrb_Int : AVX512_maskable_scalar,
-                 EVEX_4V, VEX_LIG, EVEX_B;
+                 (i32 FROUND_NO_EXC))), itins.rr>,
+                 EVEX_4V, VEX_LIG, EVEX_B, Sched<[itins.Sched]>;
 }

 // Scalar Conversion with rounding control (RC)
 multiclass avx512_cvt_fp_rc_scalar opc, string OpcodeStr, X86VectorVTInfo _,
-                                   X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+                                   X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
   defm rrb_Int : AVX512_maskable_scalar,
-                 EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+                 (_Src.VT _Src.RC:$src2), (i32 imm:$rc))),
+                 itins.rm>,
+                 EVEX_4V, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>,
                  EVEX_B, EVEX_RC;
 }
 multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr,
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
-                                      X86VectorVTInfo _dst> {
+                                      SDNode OpNodeRnd, OpndItins itins,
+                                      X86VectorVTInfo _src, X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
-    defm Z : avx512_cvt_fp_scalar,
+    defm Z : avx512_cvt_fp_scalar,
             avx512_cvt_fp_rc_scalar, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+            OpNodeRnd, itins>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
   }
 }

 multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr,
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
-                                      X86VectorVTInfo _dst> {
+                                      SDNode OpNodeRnd, OpndItins itins,
+                                      X86VectorVTInfo _src, X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
-    defm Z : avx512_cvt_fp_scalar,
-             avx512_cvt_fp_sae_scalar,
+    defm Z : avx512_cvt_fp_scalar,
+             avx512_cvt_fp_sae_scalar,
              EVEX_CD8<32, CD8VT1>, XS;
   }
 }
 defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
-                                            X86froundRnd, f64x_info, f32x_info>,
-                                            NotMemoryFoldable;
+                                            X86froundRnd, SSE_CVT_SD2SS, f64x_info,
+                                            f32x_info>, NotMemoryFoldable;
 defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
-                                            X86fpextRnd,f32x_info, f64x_info >,
-                                            NotMemoryFoldable;
+                                            X86fpextRnd, SSE_CVT_SS2SD, f32x_info,
+                                            f64x_info>, NotMemoryFoldable;

 def : Pat<(f64 (fpextend FR32X:$src)),
           (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
@@ -7505,53 +7565,53 @@ let Predicates = [HasVLX] in {

 // Unordered/Ordered scalar fp compare with SAE and set EFLAGS
 multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _,
-                              string OpcodeStr> {
+                              string OpcodeStr, OpndItins itins> {
   let hasSideEffects = 0 in
   def rb: AVX512,
-          EVEX, EVEX_B, VEX_LIG, EVEX_V128,
-          Sched<[WriteFAdd]>;
+          [], itins.rr>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+          Sched<[itins.Sched]>;
 }

 let Defs = [EFLAGS], Predicates = [HasAVX512] in {
-  defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
+  defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSE_COMIS>,
                                    AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
-  defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
+  defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSE_COMIS>,
                                    AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
-  defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
+  defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSE_COMIS>,
                                    AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
-  defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
+  defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSE_COMIS>,
                                    AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
 }

 let Defs = [EFLAGS], Predicates = [HasAVX512] in {
   defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
-                                 "ucomiss">, PS, EVEX, VEX_LIG,
+                                 "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
                                  EVEX_CD8<32, CD8VT1>;
   defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
-                                 "ucomisd">, PD, EVEX,
+                                 "ucomisd", SSE_COMIS>, PD, EVEX,
                                  VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   let Pattern = [] in {
     defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
-                                  "comiss">, PS, EVEX, VEX_LIG,
+                                  "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
                                   EVEX_CD8<32, CD8VT1>;
     defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
-                                  "comisd">, PD, EVEX,
+                                  "comisd", SSE_COMIS>, PD, EVEX,
                                   VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   }
   let isCodeGenOnly = 1 in {
     defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
-                              sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
+                              sse_load_f32, "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
                               EVEX_CD8<32, CD8VT1>;
     defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
-                              sse_load_f64, "ucomisd">, PD, EVEX,
+                              sse_load_f64, "ucomisd", SSE_COMIS>, PD, EVEX,
                               VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;

     defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
-                              sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
+                              sse_load_f32, "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
                               EVEX_CD8<32, CD8VT1>;
     defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
-                              sse_load_f64, "comisd">, PD, EVEX,
+                              sse_load_f64, "comisd", SSE_COMIS>, PD, EVEX,
                               VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   }
 }
@@ -7967,26 +8027,36 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", SSE_ALU_F64S,
 // Integer truncate and extend operations
 //-------------------------------------------------

+let Sched = WriteShuffle256 in
+def AVX512_EXTEND : OpndItins<
+  IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
+let Sched = WriteShuffle256 in
+def AVX512_TRUNCATE : OpndItins<
+  IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
 multiclass avx512_trunc_common opc, string OpcodeStr, SDNode OpNode,
-                              X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
-                              X86MemOperand x86memop> {
+                               OpndItins itins, X86VectorVTInfo SrcInfo,
+                               X86VectorVTInfo DestInfo, X86MemOperand x86memop> {
   let ExeDomain = DestInfo.ExeDomain in
   defm rr  : AVX512_maskable,
-                      EVEX, T8XS;
+                      (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                      itins.rr>, EVEX, T8XS, Sched<[itins.Sched]>;

   let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
       ExeDomain = DestInfo.ExeDomain in {
     def mr : AVX512XS8I, EVEX;
+             [], itins.rm>, EVEX, Sched<[itins.Sched.Folded]>;
     def mrk : AVX512XS8I, EVEX, EVEX_K;
+              [], itins.rm>, EVEX, EVEX_K, Sched<[itins.Sched.Folded]>;
   }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
 }
@@ -8005,112 +8075,118 @@ multiclass avx512_trunc_mr_lowering opc, string OpcodeStr, SDNode OpNode,
-                       AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+                       OpndItins itins, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
                        X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
                        X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
                        X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
                        Predicate prd = HasAVX512>{

   let Predicates = [HasVLX, prd] in {
-    defm Z128:  avx512_trunc_common,
+    defm Z128:  avx512_trunc_common,
                 avx512_trunc_mr_lowering, EVEX_V128;

-    defm Z256:  avx512_trunc_common,
+    defm Z256:  avx512_trunc_common,
                 avx512_trunc_mr_lowering, EVEX_V256;
   }
   let Predicates = [prd] in
-    defm Z:     avx512_trunc_common,
+    defm Z:     avx512_trunc_common,
                 avx512_trunc_mr_lowering, EVEX_V512;
 }

 multiclass avx512_trunc_qb opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<8, CD8VO>;
 }

 multiclass avx512_trunc_qw opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<16, CD8VQ>;
 }

 multiclass avx512_trunc_qd opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<32, CD8VH>;
 }

 multiclass avx512_trunc_db opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<8, CD8VQ>;
 }

 multiclass avx512_trunc_dw opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>;
 }

 multiclass avx512_trunc_wb opc, string OpcodeStr, SDNode OpNode,
-                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
-  defm NAME: avx512_trunc {
+  defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>;
 }

-defm VPMOVQB    : avx512_trunc_qb<0x32, "vpmovqb",   X86vtrunc,
+defm VPMOVQB    : avx512_trunc_qb<0x32, "vpmovqb",   X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSQB   : avx512_trunc_qb<0x22, "vpmovsqb",  X86vtruncs,
+defm VPMOVSQB   : avx512_trunc_qb<0x22, "vpmovsqb",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSQB  : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+defm VPMOVUSQB  : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVQW    : avx512_trunc_qw<0x34, "vpmovqw",   X86vtrunc,
+defm VPMOVQW    : avx512_trunc_qw<0x34, "vpmovqw",   X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSQW   : avx512_trunc_qw<0x24, "vpmovsqw",  X86vtruncs,
+defm VPMOVSQW   : avx512_trunc_qw<0x24, "vpmovsqw",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSQW  : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+defm VPMOVUSQW  : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVQD    : avx512_trunc_qd<0x35, "vpmovqd",   X86vtrunc,
+defm VPMOVQD    : avx512_trunc_qd<0x35, "vpmovqd",   X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi32, masked_truncstorevi32>;
-defm VPMOVSQD   : avx512_trunc_qd<0x25, "vpmovsqd",  X86vtruncs,
+defm VPMOVSQD   : avx512_trunc_qd<0x25, "vpmovsqd",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi32, masked_truncstore_s_vi32>;
-defm VPMOVUSQD  : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+defm VPMOVUSQD  : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi32, masked_truncstore_us_vi32>;
-defm VPMOVDB    : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
+defm VPMOVDB    : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSDB   : avx512_trunc_db<0x21, "vpmovsdb",  X86vtruncs,
+defm VPMOVSDB   : avx512_trunc_db<0x21, "vpmovsdb",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSDB  : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
+defm VPMOVUSDB  : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVDW    : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
+defm VPMOVDW    : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSDW   : avx512_trunc_dw<0x23, "vpmovsdw",  X86vtruncs,
+defm VPMOVSDW   : avx512_trunc_dw<0x23, "vpmovsdw",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSDW  : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
+defm VPMOVUSDW  : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVWB    : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
+defm VPMOVWB    : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, AVX512_TRUNCATE,
                                   truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSWB   : avx512_trunc_wb<0x20, "vpmovswb",  X86vtruncs,
+defm VPMOVSWB   : avx512_trunc_wb<0x20, "vpmovswb",  X86vtruncs, AVX512_TRUNCATE,
                                   truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSWB  : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
+defm VPMOVUSWB  : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, AVX512_TRUNCATE,
                                   truncstore_us_vi8, masked_truncstore_us_vi8>;

 let Predicates = [HasAVX512, NoVLX] in {
@@ -8130,150 +8206,150 @@ def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
                                             VR256X:$src, sub_ymm))), sub_xmm))>;
 }

-multiclass avx512_extend_common opc, string OpcodeStr,
OpndItins itins, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{ let ExeDomain = DestInfo.ExeDomain in { defm rr : AVX512_maskable, - EVEX; + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))), itins.rr>, + EVEX, Sched<[itins.Sched]>; defm rm : AVX512_maskable, - EVEX; + (DestInfo.VT (LdFrag addr:$src)), itins.rm>, + EVEX, Sched<[itins.Sched.Folded]>; } } multiclass avx512_extend_BW opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { - defm Z128: avx512_extend_common, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasBWI] in { - defm Z : avx512_extend_common, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_BD opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_BQ opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_WD opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_WQ opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common, EVEX_CD8<16, CD8VQ>, T8PD, 
EVEX_V512, VEX_WIG; } } multiclass avx512_extend_DQ opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi32")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; - defm Z256: avx512_extend_common, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; } } -defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">; -defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">; -defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">; -defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">; -defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">; -defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">; +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", AVX512_EXTEND>; -defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">; -defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">; -defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">; -defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">; -defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">; -defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">; +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", AVX512_EXTEND>; multiclass AVX512_pmovx_patterns; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations +// FIXME: Improve scheduling of gather/scatter instructions. 
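+// For now they use the generic WriteLoad / WriteStore classes added below.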
multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, X86MemOperand memop, PatFrag GatherNode, RegisterClass MaskRC = _.KRCWM> { @@ -8435,7 +8512,7 @@ multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, [(set _.RC:$dst, MaskRC:$mask_wb, (GatherNode (_.VT _.RC:$src1), MaskRC:$mask, vectoraddr:$src2))]>, EVEX, EVEX_K, - EVEX_CD8<_.EltSize, CD8VT1>; + EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; } multiclass avx512_gather_q_pd dopc, bits<8> qopc, @@ -8493,7 +8570,8 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), _.KRCWM:$mask, vectoraddr:$dst))]>, - EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[WriteStore]>; } multiclass avx512_scatter_q_pd dopc, bits<8> qopc, @@ -8544,7 +8622,7 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I, EVEX, EVEX_K; + [], IIC_SSE_PREFETCH>, EVEX, EVEX_K, Sched<[WriteLoad]>; } defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", @@ -9773,71 +9851,85 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, // AVX-512 - Byte shift Left/Right //===----------------------------------------------------------------------===// +let Sched = WriteVecShift in +def AVX512_BYTESHIFT : OpndItins< + IIC_SSE_INTSHDQ_P_RI, IIC_SSE_INTSHDQ_P_RI +>; + multiclass avx512_shift_packed opc, SDNode OpNode, Format MRMr, - Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + Format MRMm, string OpcodeStr, + OpndItins itins, X86VectorVTInfo _>{ def rr : AVX512; + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))], + itins.rr>, Sched<[itins.Sched]>; def rm : AVX512; + (i8 imm:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_shift_packed_all opc, SDNode OpNode, Format MRMr, - Format MRMm, string OpcodeStr, Predicate prd>{ + Format MRMm, string OpcodeStr, + OpndItins itins, Predicate prd>{ let Predicates = [prd] in defm Z512 : avx512_shift_packed, EVEX_V512; + OpcodeStr, itins, v64i8_info>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_shift_packed, EVEX_V256; + OpcodeStr, itins, v32i8x_info>, EVEX_V256; defm Z128 : avx512_shift_packed, EVEX_V128; + OpcodeStr, itins, v16i8x_info>, EVEX_V128; } } defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", - HasBWI>, AVX512PDIi8Base, EVEX_4V, VEX_WIG; + AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base, + EVEX_4V, VEX_WIG; defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", - HasBWI>, AVX512PDIi8Base, EVEX_4V, VEX_WIG; + AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base, + EVEX_4V, VEX_WIG; multiclass avx512_psadbw_packed opc, SDNode OpNode, - string OpcodeStr, X86VectorVTInfo _dst, - X86VectorVTInfo _src>{ + string OpcodeStr, OpndItins itins, + X86VectorVTInfo _dst, X86VectorVTInfo _src> { def rr : AVX512BI; + (_src.VT _src.RC:$src2))))], itins.rr>, + Sched<[itins.Sched]>; def rm : AVX512BI; + (_src.LdFrag addr:$src2))))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_psadbw_packed_all opc, SDNode OpNode, - string OpcodeStr, Predicate prd> { + string OpcodeStr, OpndItins itins, + Predicate prd> { let Predicates = [prd] in - defm Z512 : avx512_psadbw_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_psadbw_packed, EVEX_V256; 
- defm Z128 : avx512_psadbw_packed, EVEX_V128; } } defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", - HasBWI>, EVEX_4V, VEX_WIG; + SSE_MPSADBW_ITINS, HasBWI>, EVEX_4V, VEX_WIG; // Transforms to swizzle an immediate to enable better matching when // memory operand isn't in the right place. @@ -10089,7 +10181,7 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SSE_INTALU_ITINS_P, //===----------------------------------------------------------------------===// multiclass avx512_fixupimm_packed opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + OpndItins itins, X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT _.RC:$src3), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>; defm rmi : AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT (bitconvert (_.LdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>, EVEX_B; + (i32 FROUND_CURRENT)), itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } // Constraints = "$src1 = $dst" } multiclass avx512_fixupimm_packed_sae opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ + SDNode OpNode, OpndItins itins, + X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rrib : AVX512_maskable_3src, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } } multiclass avx512_fixupimm_scalar opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, X86VectorVTInfo _src3VT> { + OpndItins itins, X86VectorVTInfo _, + X86VectorVTInfo _src3VT> { let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src_scalar opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; - + (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>; defm rrib : AVX512_maskable_3src_scalar opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), (i32 imm:$src4), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmi : AVX512_maskable_3src_scalar opc, string OpcodeStr, SDNode OpNode, (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_fixupimm_packed_all{ +multiclass avx512_fixupimm_packed_all { let Predicates = [HasAVX512] in - defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>, - avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>, - AVX512AIi8Base, EVEX_4V, EVEX_V512; + defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info512>, + avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>, - AVX512AIi8Base, EVEX_4V, EVEX_V128; - defm Z256 : 
avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>, - AVX512AIi8Base, EVEX_4V, EVEX_V256; + defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128; + defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256; } } defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f32x_info, v4i32x_info>, + SSE_ALU_F32S, f32x_info, v4i32x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f64x_info, v2i64x_info>, + SSE_ALU_F64S, f64x_info, v2i64x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; -defm VFIXUPIMMPS : avx512_fixupimm_packed_all, +defm VFIXUPIMMPS : avx512_fixupimm_packed_all, EVEX_CD8<32, CD8VF>; -defm VFIXUPIMMPD : avx512_fixupimm_packed_all, +defm VFIXUPIMMPD : avx512_fixupimm_packed_all, EVEX_CD8<64, CD8VF>, VEX_W; @@ -10450,43 +10549,46 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND, let Constraints = "$src1 = $dst" in multiclass VNNI_rmb Op, string OpStr, SDNode OpNode, - X86VectorVTInfo VTI> { + OpndItins itins, X86VectorVTInfo VTI> { defm r : AVX512_maskable_3src, - EVEX_4V, T8PD; + VTI.RC:$src2, VTI.RC:$src3)), + itins.rr>, EVEX_4V, T8PD, Sched<[itins.Sched]>; defm m : AVX512_maskable_3src, - EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD; + (VTI.LdFrag addr:$src3))))), + itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm mb : AVX512_maskable_3src, - EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD; + (VTI.ScalarLdFrag addr:$src3)))), + itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, + T8PD, Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass VNNI_common Op, string OpStr, SDNode OpNode> { +multiclass VNNI_common Op, string OpStr, SDNode OpNode, OpndItins itins> { let Predicates = [HasVNNI] in - defm Z : VNNI_rmb, EVEX_V512; + defm Z : VNNI_rmb, EVEX_V512; let Predicates = [HasVNNI, HasVLX] in { - defm Z256 : VNNI_rmb, EVEX_V256; - defm Z128 : VNNI_rmb, EVEX_V128; + defm Z256 : VNNI_rmb, EVEX_V256; + defm Z128 : VNNI_rmb, EVEX_V128; } } -defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd>; -defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds>; -defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd>; -defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds>; +// FIXME: Is there a better scheduler itinerary for VPDP? +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SSE_PMADD>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SSE_PMADD>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SSE_PMADD>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>; //===----------------------------------------------------------------------===// // Bit Algorithms diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 82885687bb42c..0b63f3763021f 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -142,6 +142,15 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size), [(X86WinAlloca GR64:$size)]>, Requires<[In64BitMode]>; +// These instructions XOR the frame pointer into a GPR. They are used in some +// stack protection schemes. These are post-RA pseudos because we only know the +// frame register after register allocation. 
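+// The $$FP token in the asm string is only a debug-printing placeholder;
+// expandPostRAPseudo() rewrites these pseudos into plain XOR32rr/XOR64rr
+// against the real frame register (see expandXorFP in X86InstrInfo.cpp below).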
+let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
+  def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
+                   "xorl\t$$FP, $src", []>, Requires<[NotLP64]>;
+  def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
+                   "xorq\t$$FP, $src", []>, Requires<[In64BitMode]>;
+}

//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 2165d75869068..6788ba50c96aa 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -356,28 +356,31 @@ def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;

// Floating point cmovs.
-class FpIf32CMov pattern> :
-  FpI_, Requires<[FPStackf32, HasCMov]>;
-class FpIf64CMov pattern> :
-  FpI_, Requires<[FPStackf64, HasCMov]>;
+class FpIf32CMov pattern,
+                 InstrItinClass itin> :
+  FpI_, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov pattern,
+                 InstrItinClass itin> :
+  FpI_, Requires<[FPStackf64, HasCMov]>;

multiclass FPCMov {
  def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
                         CondMovFP,
                         [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
-                                           cc, EFLAGS))]>;
+                                           cc, EFLAGS))], IIC_FCMOV>;
  def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
                         CondMovFP,
                         [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
-                                           cc, EFLAGS))]>;
+                                           cc, EFLAGS))], IIC_FCMOV>;
  def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
                   CondMovFP,
                   [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
-                                     cc, EFLAGS))]>,
+                                     cc, EFLAGS))], IIC_FCMOV>,
              Requires<[HasCMov]>;
}

let Defs = [FPSW] in {
+let SchedRW = [WriteFAdd] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB  : FPCMov;
defm CMOVBE : FPCMov;
@@ -392,22 +395,23 @@ defm CMOVNP : FPCMov;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F  : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
-                   "fcmovb\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
-                   "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVE_F  : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
-                   "fcmove\t{$op, %st(0)|st(0), $op}">;
+                   "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVP_F  : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
-                   "fcmovu\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
-                   "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
-                   "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
-                   "fcmovne\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
-                   "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+                   "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
} // Predicates = [HasCMov]
+} // SchedRW

// Floating point loads & stores.
let canFoldAsLoad = 1 in { diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index a5bff06e70b5b..96f19d35815e3 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -7762,6 +7762,18 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } +static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &Subtarget = MF.getSubtarget(); + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + unsigned XorOp = + MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr; + MIB->setDesc(TII.get(XorOp)); + MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef); + return true; +} + // This is used to handle spills for 128/256-bit registers when we have AVX512, // but not VLX. If it uses an extended register we need to use an instruction // that loads the lower 128/256-bit, but is available with only AVX512F. @@ -7956,6 +7968,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; + case X86::XOR64_FP: + case X86::XOR32_FP: + return expandXorFP(MIB, *this); } return false; } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index ea30393242d72..d3a61f7b0e1cb 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2135,6 +2135,11 @@ let Predicates = [UseSSE2] in { // SSE 1 & 2 - Compare Instructions //===----------------------------------------------------------------------===// +let Sched = WriteFAdd in +def SSE_COMIS : OpndItins< + IIC_SSE_COMIS_RR, IIC_SSE_COMIS_RM +>; + // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr> { + PatFrag ld_frag, string OpcodeStr, + OpndItins itins> { let hasSideEffects = 0 in { def rr: SI, - Sched<[WriteFAdd]>; + itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in def rm: SI, - Sched<[WriteFAddLd, ReadAfterLd]>; + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp multiclass sse12_ord_cmp_int opc, RegisterClass RC, SDNode OpNode, ValueType vt, Operand memop, - ComplexPattern mem_cpat, string OpcodeStr> { + ComplexPattern mem_cpat, string OpcodeStr, + OpndItins itins> { def rr: SI, - Sched<[WriteFAdd]>; + itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in def rm: SI, - Sched<[WriteFAddLd, ReadAfterLd]>; + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG; + "ucomiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG; + "ucomisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG; let Pattern = [] in { defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss">, PS, VEX, VEX_LIG, VEX_WIG; + "comiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd">, PD, VEX, VEX_LIG, VEX_WIG; + "comisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG; } let isCodeGenOnly = 1 in { defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, 
X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG; + sse_load_f32, "ucomiss", SSE_COMIS>, PS, VEX, VEX_WIG; defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG; + sse_load_f64, "ucomisd", SSE_COMIS>, PD, VEX, VEX_WIG; defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS, VEX, VEX_WIG; + sse_load_f32, "comiss", SSE_COMIS>, PS, VEX, VEX_WIG; defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD, VEX, VEX_WIG; + sse_load_f64, "comisd", SSE_COMIS>, PD, VEX, VEX_WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS; + "ucomiss", SSE_COMIS>, PS; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD; + "ucomisd", SSE_COMIS>, PD; let Pattern = [] in { defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss">, PS; + "comiss", SSE_COMIS>, PS; defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd">, PD; + "comisd", SSE_COMIS>, PD; } let isCodeGenOnly = 1 in { defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS; + sse_load_f32, "ucomiss", SSE_COMIS>, PS; defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD; + sse_load_f64, "ucomisd", SSE_COMIS>, PD; defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS; + sse_load_f32, "comiss", SSE_COMIS>, PS; defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD; + sse_load_f64, "comisd", SSE_COMIS>, PD; } } // Defs = [EFLAGS] @@ -8435,10 +8442,10 @@ let Predicates = [HasAVX2, NoVLX] in { (VPSRAVDYrm VR256:$src1, addr:$src2)>; } - - //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations + +// FIXME: Improve scheduling of gather instructions. 
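+// For now they use the generic WriteLoad class added below.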
multiclass avx2_gather opc, string OpcodeStr, ValueType VTx, ValueType VTy, PatFrag GatherNode128, PatFrag GatherNode256, RegisterClass RC256, @@ -8450,14 +8457,16 @@ multiclass avx2_gather opc, string OpcodeStr, ValueType VTx, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), [(set (VTx VR128:$dst), (MTx VR128:$mask_wb), (GatherNode128 VR128:$src1, VR128:$mask, - vectoraddr:$src2))]>, VEX; + vectoraddr:$src2))]>, + VEX, Sched<[WriteLoad]>; def Yrm : AVX28I, VEX, VEX_L; + vectoraddr:$src2))]>, + VEX, VEX_L, Sched<[WriteLoad]>; } let Predicates = [UseAVX2] in { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index f6530c4eee2fd..44bbc3f1b3fa9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -484,6 +484,11 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); if (NewOpc == Opc) return false; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index fae0889950b25..7821971b4a2bb 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -479,9 +479,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), - X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index c43ae0b17ca02..f6c3649c159c1 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -447,6 +447,7 @@ def IIC_CMPX_LOCK_16B : InstrItinClass; def IIC_XADD_LOCK_MEM : InstrItinClass; def IIC_XADD_LOCK_MEM8 : InstrItinClass; +def IIC_FCMOV : InstrItinClass; def IIC_FILD : InstrItinClass; def IIC_FLD : InstrItinClass; def IIC_FLD80 : InstrItinClass; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index a0821994214d9..31424966336a3 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -364,6 +364,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData] >, InstrItinData] >, + InstrItinData] >, InstrItinData] >, InstrItinData] >, InstrItinData] >, diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 5ebe8a28422ea..2bae818cfcd7a 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -761,7 +761,7 @@ def : InstRW<[ZnWriteFPU3], (instregex "LD_F1")>; // FLDPI FLDL2E etc. def : InstRW<[ZnWriteFPU3], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; -def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; +def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>; // FNSTSW. // AX. 
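Most of the lib/Transforms changes that follow are the same mechanical rewrite: open-coded bit tests on a ModRefInfo value (`MRB & MRI_Mod`, `A == MRI_ModRef || A == MRI_Ref`, `MR != MRI_NoModRef`) are replaced by the named predicate helpers `createModRefInfo`, `isNoModRef`, `isModSet`, `isRefSet`, `isModOrRefSet`, and `intersectModRef`. A minimal standalone sketch of the pattern follows; the enumerator layout and helper names mirror llvm/Analysis/AliasAnalysis.h, but this mock is illustrative only, not the LLVM implementation:

```cpp
// Standalone mock of the ModRefInfo predicate helpers. Enumerator values and
// names follow llvm/Analysis/AliasAnalysis.h; the mock only demonstrates why
// the rewritten checks below are equivalent to the old bitmask tests.
#include <cassert>
#include <initializer_list>

enum ModRefInfo {
  MRI_NoModRef = 0,              // Neither reads nor writes memory.
  MRI_Ref      = 1,              // May read memory.
  MRI_Mod      = 2,              // May write memory.
  MRI_ModRef   = MRI_Ref | MRI_Mod,
};

inline bool isNoModRef(ModRefInfo MRI)    { return MRI == MRI_NoModRef; }
inline bool isModOrRefSet(ModRefInfo MRI) { return MRI & MRI_ModRef; }
inline bool isModSet(ModRefInfo MRI)      { return MRI & MRI_Mod; }
inline bool isRefSet(ModRefInfo MRI)      { return MRI & MRI_Ref; }

int main() {
  for (ModRefInfo A : {MRI_NoModRef, MRI_Ref, MRI_Mod, MRI_ModRef}) {
    // DeadStoreElimination: "A == MRI_ModRef || A == MRI_Ref" asks whether
    // the Ref bit is present at all, i.e. isRefSet(A).
    assert((A == MRI_ModRef || A == MRI_Ref) == isRefSet(A));
    // MemCpyOpt: "A != MRI_NoModRef" asks whether any bit is present,
    // i.e. isModOrRefSet(A).
    assert((A != MRI_NoModRef) == isModOrRefSet(A));
  }
  return 0;
}
```

Reading the checks through these helpers makes the intent explicit at each call site; in particular, the DeadStoreElimination hunk below can collapse `A == MRI_ModRef || A == MRI_Ref` into a single `isRefSet` call without changing behavior.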
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index f9850619f9638..5352e32479bb9 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -130,17 +130,18 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
          SCCNodes.count(CS.getCalledFunction()))
        continue;
      FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS);
+      ModRefInfo MRI = createModRefInfo(MRB);

      // If the call doesn't access memory, we're done.
-      if (!(MRB & MRI_ModRef))
+      if (isNoModRef(MRI))
        continue;

      if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
        // The call could access any memory. If that includes writes, give up.
-        if (MRB & MRI_Mod)
+        if (isModSet(MRI))
          return MAK_MayWrite;

        // If it reads, note it.
-        if (MRB & MRI_Ref)
+        if (isRefSet(MRI))
          ReadsMemory = true;
        continue;
      }
@@ -162,10 +163,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
          continue;

-        if (MRB & MRI_Mod)
+        if (isModSet(MRI))
          // Writes non-local memory. Give up.
          return MAK_MayWrite;
-        if (MRB & MRI_Ref)
+        if (isRefSet(MRI))
          // Ok, it reads non-local memory.
          ReadsMemory = true;
      }
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index e70e7591f6a70..c4e61218f3f32 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -248,7 +248,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
    // Ok, now we know we have not seen a store yet. See if Inst can write to
    // our load location; if it cannot, just ignore the instruction.
-    if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod))
+    if (!isModSet(AA->getModRefInfo(Inst, Loc)))
      continue;

    Store = dyn_cast<StoreInst>(Inst);
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 040e0f59c61a6..8f468ebf89491 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -329,13 +329,15 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// See if we can prove that the given overflow intrinsic will not overflow.
static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) {
  using OBO = OverflowingBinaryOperator;
-  auto NoWrapOnAddition = [&] (Value *LHS, Value *RHS, unsigned NoWrapKind) {
+  auto NoWrap = [&] (Instruction::BinaryOps BinOp, unsigned NoWrapKind) {
+    Value *RHS = II->getOperand(1);
    ConstantRange RRange = LVI->getConstantRange(RHS, II->getParent(), II);
    ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
-        BinaryOperator::Add, RRange, NoWrapKind);
+        BinOp, RRange, NoWrapKind);
    // As an optimization, do not compute LRange if we do not need it.
if (NWRegion.isEmptySet()) return false; + Value *LHS = II->getOperand(0); ConstantRange LRange = LVI->getConstantRange(LHS, II->getParent(), II); return NWRegion.contains(LRange); }; @@ -343,11 +345,13 @@ static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) { default: break; case Intrinsic::uadd_with_overflow: - return NoWrapOnAddition(II->getOperand(0), II->getOperand(1), - OBO::NoUnsignedWrap); + return NoWrap(Instruction::Add, OBO::NoUnsignedWrap); case Intrinsic::sadd_with_overflow: - return NoWrapOnAddition(II->getOperand(0), II->getOperand(1), - OBO::NoSignedWrap); + return NoWrap(Instruction::Add, OBO::NoSignedWrap); + case Intrinsic::usub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoUnsignedWrap); + case Intrinsic::ssub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoSignedWrap); } return false; } @@ -356,12 +360,17 @@ static void processOverflowIntrinsic(IntrinsicInst *II) { Value *NewOp = nullptr; switch (II->getIntrinsicID()) { default: - llvm_unreachable("Illegal instruction."); + llvm_unreachable("Unexpected instruction."); case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1), II->getName(), II); break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1), + II->getName(), II); + break; } ++NumOverflows; IRBuilder<> B(II); @@ -376,7 +385,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { SmallVector ArgNos; unsigned ArgNo = 0; - if (IntrinsicInst *II = dyn_cast(CS.getInstruction())) { + if (auto *II = dyn_cast(CS.getInstruction())) { if (willNotOverflow(II, LVI)) { processOverflowIntrinsic(II); return true; @@ -556,7 +565,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { // blocks before querying later blocks (which require us to analyze early // blocks). Eagerly simplifying shallow blocks means there is strictly less // work to do for deep blocks. This also means we don't visit unreachable - // blocks. + // blocks. for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { bool BBChanged = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 877050ec17718..e703014bb0e61 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -594,11 +594,9 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI, } for (; BI != EI; ++BI) { Instruction *I = &*BI; - if (I->mayWriteToMemory() && I != SecondI) { - auto Res = AA->getModRefInfo(I, MemLoc); - if (Res & MRI_Mod) + if (I->mayWriteToMemory() && I != SecondI) + if (isModSet(AA->getModRefInfo(I, MemLoc))) return false; - } } if (B != FirstBB) { assert(B != &FirstBB->getParent()->getEntryBlock() && @@ -822,9 +820,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. 
- ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - - return A == MRI_ModRef || A == MRI_Ref; + return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI))); }); // If all of the allocas were clobbered by the call then we're not going @@ -1255,7 +1251,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) + if (isRefSet(AA->getModRefInfo(DepWrite, Loc))) break; InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false, diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 3e331cddb4f2c..052ead8df3102 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -788,7 +788,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, ++BI) for (Instruction &I : **BI) if (IgnoredStores.count(&I) == 0 && - (AA.getModRefInfo(&I, StoreLoc) & Access)) + intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)) return true; return false; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 86d7b5e8ddd5b..cd3e4ba88bca0 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -518,7 +518,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, const LoadInst *LI) { // If the store alias this position, early bail out. MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc))) return false; // Keep track of the arguments of all instruction we plan to lift @@ -542,20 +542,20 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) { auto *C = &*I; - bool MayAlias = AA.getModRefInfo(C, None) != MRI_NoModRef; + bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None)); bool NeedLift = false; if (Args.erase(C)) NeedLift = true; else if (MayAlias) { NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) { - return AA.getModRefInfo(C, ML); + return isModOrRefSet(AA.getModRefInfo(C, ML)); }); if (!NeedLift) NeedLift = llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) { - return AA.getModRefInfo(C, CS); + return isModOrRefSet(AA.getModRefInfo(C, CS)); }); } @@ -565,18 +565,18 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, if (MayAlias) { // Since LI is implicitly moved downwards past the lifted instructions, // none of them may modify its source. - if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod) + if (isModSet(AA.getModRefInfo(C, LoadLoc))) return false; else if (auto CS = ImmutableCallSite(C)) { // If we can't lift this before P, it's game over. - if (AA.getModRefInfo(P, CS) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, CS))) return false; CallSites.push_back(CS); } else if (isa(C) || isa(C) || isa(C)) { // If we can't lift this before P, it's game over. auto ML = MemoryLocation::get(C); - if (AA.getModRefInfo(P, ML) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, ML))) return false; MemLocs.push_back(ML); @@ -631,7 +631,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // of at the store position. 
Instruction *P = SI; for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) { + if (isModSet(AA.getModRefInfo(&I, LoadLoc))) { P = &I; break; } @@ -702,7 +702,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { + if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) { C = nullptr; break; } @@ -934,9 +934,9 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, AliasAnalysis &AA = LookupAliasAnalysis(); ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. - if (MR != MRI_NoModRef) + if (isModOrRefSet(MR)) MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); - if (MR != MRI_NoModRef) + if (isModOrRefSet(MR)) return false; // We can't create address space casts here because we don't know if they're diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 5210f165b8742..cfb8a062299f9 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -68,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, if (LoadInst *L = dyn_cast(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA.getModRefInfo(S, Loc) & MRI_Mod) + if (isModSet(AA.getModRefInfo(S, Loc))) return false; } @@ -83,7 +83,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, return false; for (Instruction *S : Stores) - if (AA.getModRefInfo(S, CS) & MRI_Mod) + if (isModSet(AA.getModRefInfo(S, CS))) return false; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index f5aa47f927e19..a8782e05851f2 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -332,7 +332,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // Writes to memory only matter if they may alias the pointer // being loaded from. 
const DataLayout &DL = L->getModule()->getDataLayout(); - if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || + if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 8f1626a149a5b..394c951630cff 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5174,7 +5174,7 @@ static void reuseTableCompare( for (auto ValuePair : Values) { Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(), ValuePair.second, CmpOp1, true); - if (!CaseConst || CaseConst == DefaultConst) + if (!CaseConst || CaseConst == DefaultConst || isa(CaseConst)) return; assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index afa003c4dd974..aa81c3aff8971 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -40,10 +40,10 @@ define [1 x double] @constant() { ret [1 x double] [double 1.0] } -; The key problem here is that we may fail to create an MBB referenced by a -; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things -; happen. -; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate constant: [1 x double] (in function: pending_phis) + ; The key problem here is that we may fail to create an MBB referenced by a + ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things + ; happen. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %6, %2; mem:ST4[%addr] GPR:%6,%2 (in function: pending_phis) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis: define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) { @@ -54,7 +54,7 @@ end: ret i32 %res true: - %t = extractvalue [1 x double] [double 1.0], 0 + store atomic i32 42, i32* %addr seq_cst, align 4 br label %end false: @@ -90,6 +90,16 @@ define i128 @sequence_sizes([8 x i8] %in) { ret i128 undef } +; Just to make sure we don't accidentally emit a normal load/store. +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: %2(s64) = G_LOAD %0; mem:LD8[%addr] GPR:%2,%0 (in function: atomic_ops) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for atomic_ops +; FALLBACK-WITH-REPORT-LABEL: atomic_ops: +define i64 @atomic_ops(i64* %addr) { + store atomic i64 0, i64* %addr unordered, align 8 + %res = load atomic i64, i64* %addr seq_cst, align 8 + ret i64 %res +} + ; Make sure we don't mess up metadata arguments. 
declare void @llvm.write_register.i64(metadata, i64) diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 62203213a78cf..e786832797543 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1332,12 +1332,12 @@ define void @test_lifetime_intrin() { define void @test_load_store_atomics(i8* %addr) { ; CHECK-LABEL: name: test_load_store_atomics ; CHECK: [[ADDR:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[V0:%[0-9]+]]:_(s8) = G_ATOMIC_LOAD [[ADDR]](p0) :: (load unordered 1 from %ir.addr) -; CHECK: G_ATOMIC_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr) -; CHECK: [[V1:%[0-9]+]]:_(s8) = G_ATOMIC_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr) -; CHECK: G_ATOMIC_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr) -; CHECK: [[V2:%[0-9]+]]:_(s8) = G_ATOMIC_LOAD [[ADDR]](p0) :: (load syncscope("singlethread") seq_cst 1 from %ir.addr) -; CHECK: G_ATOMIC_STORE [[V2]](s8), [[ADDR]](p0) :: (store syncscope("singlethread") monotonic 1 into %ir.addr) +; CHECK: [[V0:%[0-9]+]]:_(s8) = G_LOAD [[ADDR]](p0) :: (load unordered 1 from %ir.addr) +; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr) +; CHECK: [[V1:%[0-9]+]]:_(s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr) +; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr) +; CHECK: [[V2:%[0-9]+]]:_(s8) = G_LOAD [[ADDR]](p0) :: (load syncscope("singlethread") seq_cst 1 from %ir.addr) +; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store syncscope("singlethread") monotonic 1 into %ir.addr) %v0 = load atomic i8, i8* %addr unordered, align 1 store atomic i8 %v0, i8* %addr monotonic, align 1 diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-atomic-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-atomic-load-store.mir deleted file mode 100644 index da95f6b302e56..0000000000000 --- a/test/CodeGen/AArch64/GlobalISel/legalize-atomic-load-store.mir +++ /dev/null @@ -1,91 +0,0 @@ -# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s - ---- | - target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - target triple = "aarch64--" - define void @test_load(i8* %addr) { - entry: - ret void - } - define void @test_store(i8* %addr) { - entry: - ret void - } -... - ---- -name: test_load -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } - - { id: 6, class: _ } -body: | - bb.0.entry: - liveins: %x0, %x1, %x2, %x3 - ; CHECK-LABEL: name: test_load - %0(p0) = COPY %x0 - - ; CHECK: %1:_(s8) = G_ATOMIC_LOAD %0(p0) :: (load unordered 1 from %ir.addr) - %1(s8) = G_ATOMIC_LOAD %0 :: (load unordered 1 from %ir.addr) - %10:_(s32) = G_ANYEXT %1 - %w0 = COPY %10 - - ; CHECK: %2:_(s16) = G_ATOMIC_LOAD %0(p0) :: (load unordered 2 from %ir.addr) - %2(s16) = G_ATOMIC_LOAD %0 :: (load unordered 2 from %ir.addr) - %11:_(s32) = G_ANYEXT %2 - %w0 = COPY %11 - - ; CHECK: %3:_(s32) = G_ATOMIC_LOAD %0(p0) :: (load unordered 4 from %ir.addr) - %3(s32) = G_ATOMIC_LOAD %0 :: (load unordered 4 from %ir.addr) - %w0 = COPY %3 - - ; CHECK: %4:_(s64) = G_ATOMIC_LOAD %0(p0) :: (load unordered 8 from %ir.addr) - %4(s64) = G_ATOMIC_LOAD %0 :: (load unordered 8 from %ir.addr) - %x0 = COPY %4 - - %5(p0) = G_ATOMIC_LOAD %0(p0) :: (load unordered 8 from %ir.addr) - %12:_(s64) = G_PTRTOINT %5 - %x0 = COPY %12 -... 
- ---- -name: test_store -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } - - { id: 5, class: _ } - - { id: 6, class: _ } - - { id: 7, class: _ } -body: | - bb.0.entry: - liveins: %x0, %x1, %x2, %x3 - ; CHECK-LABEL: name: test_store - - %0(p0) = COPY %x0 - %1(s32) = COPY %w1 - - ; CHECK: G_ATOMIC_STORE %2(s8), %0(p0) :: (store unordered 1 into %ir.addr) - %2(s8) = G_TRUNC %1 - G_ATOMIC_STORE %2, %0 :: (store unordered 1 into %ir.addr) - - ; CHECK: G_ATOMIC_STORE %3(s16), %0(p0) :: (store unordered 2 into %ir.addr) - %3(s16) = G_TRUNC %1 - G_ATOMIC_STORE %3, %0 :: (store unordered 2 into %ir.addr) - - ; CHECK: G_ATOMIC_STORE %1(s32), %0(p0) :: (store unordered 4 into %ir.addr) - G_ATOMIC_STORE %1, %0 :: (store unordered 4 into %ir.addr) - - ; CHECK: G_ATOMIC_STORE %4(s64), %0(p0) :: (store unordered 8 into %ir.addr) - %4(s64) = G_PTRTOINT %0(p0) - G_ATOMIC_STORE %4, %0 :: (store unordered 8 into %ir.addr) - - ; CHECK: G_ATOMIC_STORE %0(p0), %0(p0) :: (store unordered 8 into %ir.addr) - G_ATOMIC_STORE %0(p0), %0(p0) :: (store unordered 8 into %ir.addr) -... diff --git a/test/CodeGen/AArch64/GlobalISel/select-atomic-load.mir b/test/CodeGen/AArch64/GlobalISel/select-atomic-load.mir deleted file mode 100644 index 48a8a5187982e..0000000000000 --- a/test/CodeGen/AArch64/GlobalISel/select-atomic-load.mir +++ /dev/null @@ -1,431 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s - ---- | - target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - - define void @load_s8_gpr_unordered(i64* %addr) { ret void } - define void @load_s8_gpr_monotonic(i64* %addr) { ret void } - define void @load_s8_gpr_acquire(i64* %addr) { ret void } - define void @load_s8_gpr_release(i64* %addr) { ret void } - define void @load_s8_gpr_acq_rel(i64* %addr) { ret void } - define void @load_s8_gpr_seq_cst(i64* %addr) { ret void } - - define void @load_s32_gpr_unordered(i64* %addr) { ret void } - define void @load_s32_gpr_monotonic(i64* %addr) { ret void } - define void @load_s32_gpr_acquire(i64* %addr) { ret void } - define void @load_s32_gpr_release(i64* %addr) { ret void } - define void @load_s32_gpr_acq_rel(i64* %addr) { ret void } - define void @load_s32_gpr_seq_cst(i64* %addr) { ret void } - - define void @load_s64_gpr_unordered(i64* %addr) { ret void } - define void @load_s64_gpr_monotonic(i64* %addr) { ret void } - define void @load_s64_gpr_acquire(i64* %addr) { ret void } - define void @load_s64_gpr_release(i64* %addr) { ret void } - define void @load_s64_gpr_acq_rel(i64* %addr) { ret void } - define void @load_s64_gpr_seq_cst(i64* %addr) { ret void } -... - ---- -name: load_s8_gpr_unordered -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_unordered - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load release 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load release 4 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... 
- ---- -name: load_s8_gpr_monotonic -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_monotonic - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load release 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load release 4 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... - ---- -name: load_s8_gpr_acquire -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_acquire - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARB [[COPY]] :: (load acquire 1 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load acquire 1 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... - ---- -name: load_s8_gpr_release -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_release - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRBBui [[COPY]], 0 :: (load release 1 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load release 1 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... - ---- -name: load_s8_gpr_acq_rel -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_acq_rel - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARB [[COPY]] :: (load acq_rel 1 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load acq_rel 1 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... - ---- -name: load_s8_gpr_seq_cst -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s8_gpr_seq_cst - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARB [[COPY]] :: (load seq_cst 1 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s8) = G_ATOMIC_LOAD %0 :: (load seq_cst 1 from %ir.addr) - %2:gpr(s32) = G_ANYEXT %1 - %x0 = COPY %2(s32) -... - ---- -name: load_s32_gpr_unordered -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_unordered - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load release 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load release 4 from %ir.addr) - %x0 = COPY %1(s32) -... 
- ---- -name: load_s32_gpr_monotonic -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_monotonic - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load release 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load release 4 from %ir.addr) - %x0 = COPY %1(s32) -... - ---- -name: load_s32_gpr_acquire -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_acquire - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARW [[COPY]] :: (load acquire 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load acquire 4 from %ir.addr) - %x0 = COPY %1(s32) -... - ---- -name: load_s32_gpr_release -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_release - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load release 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load release 4 from %ir.addr) - %x0 = COPY %1(s32) -... - ---- -name: load_s32_gpr_acq_rel -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_acq_rel - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARW [[COPY]] :: (load acq_rel 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load acq_rel 4 from %ir.addr) - %x0 = COPY %1(s32) -... - ---- -name: load_s32_gpr_seq_cst -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s32_gpr_seq_cst - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[T0:%[0-9]+]]:gpr32 = LDARW [[COPY]] :: (load seq_cst 4 from %ir.addr) - ; CHECK: %x0 = COPY [[T0]] - %0(p0) = COPY %x0 - %1(s32) = G_ATOMIC_LOAD %0 :: (load seq_cst 4 from %ir.addr) - %x0 = COPY %1(s32) -... - - ---- -name: load_s64_gpr_unordered -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_unordered - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[COPY]], 0 :: (load release 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load release 8 from %ir.addr) - %x0 = COPY %1(s64) -... - ---- -name: load_s64_gpr_monotonic -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_monotonic - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[COPY]], 0 :: (load release 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load release 8 from %ir.addr) - %x0 = COPY %1(s64) -... 
- ---- -name: load_s64_gpr_acquire -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_acquire - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDARX [[COPY]] :: (load acquire 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load acquire 8 from %ir.addr) - %x0 = COPY %1(s64) -... - ---- -name: load_s64_gpr_release -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_release - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[COPY]], 0 :: (load release 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load release 8 from %ir.addr) - %x0 = COPY %1(s64) -... - ---- -name: load_s64_gpr_acq_rel -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_acq_rel - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDARX [[COPY]] :: (load acq_rel 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load acq_rel 8 from %ir.addr) - %x0 = COPY %1(s64) -... - ---- -name: load_s64_gpr_seq_cst -legalized: true -regBankSelected: true - -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - -body: | - bb.0: - liveins: %x0 - - ; CHECK-LABEL: name: load_s64_gpr_seq_cst - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY %x0 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDARX [[COPY]] :: (load seq_cst 8 from %ir.addr) - ; CHECK: %x0 = COPY [[LDRXui]] - %0(p0) = COPY %x0 - %1(s64) = G_ATOMIC_LOAD %0 :: (load seq_cst 8 from %ir.addr) - %x0 = COPY %1(s64) -... - diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 8105fd44da91e..d0917e29495ad 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,22 +1,37 @@ -; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOOPT %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s -; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 s[4:5] +; GCN-LABEL: {{^}}test_debug_value: +; NOOPT: s_load_dwordx2 s[4:5] ; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE? 
-; CHECK: ; kill: %sgpr4_sgpr5 %sgpr4_sgpr5 -; CHECK-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef +; NOOPT: ; kill: %sgpr8_sgpr9 %sgpr4_sgpr5 +; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef -; CHECK: buffer_store_dword -; CHECK: s_endpgm +; GCN: flat_store_dword +; GCN: s_endpgm define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: - tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 + tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 ret void } -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 +; Check for infinite loop in some cases with dbg_value in +; SIOptimizeExecMaskingPreRA (somehow related to undef argument). + +; GCN-LABEL: {{^}}only_undef_dbg_value: +; NOOPT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef] undef +; NOOPT-NEXT: s_endpgm + +; OPT: s_endpgm +define amdgpu_kernel void @only_undef_dbg_value() #1 { +bb: + call void @llvm.dbg.value(metadata <4 x float> undef, metadata !10, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)) #2, !dbg !14 + ret void +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir new file mode 100644 index 0000000000000..a89011a0cce01 --- /dev/null +++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -0,0 +1,333 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa-opencl -verify-machineinstrs -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- | + %struct.widget.0 = type { float, i32, i32 } + %struct.baz = type { <4 x float>, <4 x float>, <2 x float>, i32, i32 } + %struct.snork = type { float, float, float, i32, float, float, float, float, %struct.spam } + %struct.spam = type { %struct.zot, [16 x i8] } + %struct.zot = type { float, float, float, float, <4 x float> } + %struct.wombat = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, [2 x i16], [2 x i16] } + %struct.wombat.1 = type { [4 x i32], [4 x i32], [4 x i32], [4 x i32], i32, i32, i32, i32 } + + @sched_dbg_value_crash.tmp6 = internal unnamed_addr addrspace(3) global [256 x [16 x i8]] undef, align 16 + + define amdgpu_kernel void @sched_dbg_value_crash(i8 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture readonly %arg1, %struct.widget.0 addrspace(1)* nocapture readonly %arg2, %struct.baz addrspace(1)* nocapture readonly %arg3, %struct.snork addrspace(1)* nocapture %arg4) local_unnamed_addr #2 { + bb: + %0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %tmp5 = alloca %struct.wombat, align 16 + %1 = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %2 = bitcast i8 addrspace(2)* %1 to i32 addrspace(2)* + %3 = getelementptr inbounds i32, i32 addrspace(2)* %2, i64 1 + %4 = bitcast i32 addrspace(2)* %3 to <2 x i32> addrspace(2)*, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %5 = load <2 x i32>, <2 x i32> addrspace(2)* %4, align 4, !invariant.load !3 + %6 = extractelement <2 x i32> %5, i32 0 + %7 = extractelement <2 x i32> %5, i32 1 + %8 = lshr i32 %6, 16 + %9 = call i32 @llvm.amdgcn.workitem.id.x(), !range !4 + %10 = call 
i32 @llvm.amdgcn.workitem.id.y(), !range !4 + %11 = call i32 @llvm.amdgcn.workitem.id.z(), !range !4 + %12 = mul nuw nsw i32 %8, %7 + %13 = mul i32 %12, %9 + %14 = mul nuw nsw i32 %10, %7 + %15 = add i32 %13, %14 + %16 = add i32 %15, %11 + %17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16 + %tmp7 = load i64, i64 addrspace(2)* null, align 536870912 + %tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4 + %tmp9 = zext i32 %tmp8 to i64 + %tmp10 = add i64 %tmp7, %tmp9 + %tmp11 = shl i64 %tmp10, 32 + %tmp12 = ashr exact i64 %tmp11, 32 + %tmp13 = getelementptr inbounds %struct.widget.0, %struct.widget.0 addrspace(1)* %arg2, i64 %tmp12, i32 1 + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = getelementptr inbounds %struct.baz, %struct.baz addrspace(1)* %arg3, i64 %tmp12, i32 1 + %tmp16 = load <4 x float>, <4 x float> addrspace(1)* %tmp15, align 16 + %tmp17 = sext i32 %tmp14 to i64 + %tmp18 = load i32, i32 addrspace(1)* %0, align 4 + %tmp19 = zext i32 %tmp18 to i64 + %tmp20 = shl nuw nsw i64 %tmp19, 2 + %tmp21 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp20 + %tmp22 = bitcast i8 addrspace(1)* %tmp21 to %struct.wombat.1 addrspace(1)* + %tmp23 = bitcast %struct.wombat* %tmp5 to i8* + call void @llvm.lifetime.start.p0i8(i64 144, i8* nonnull %tmp23) #3 + %tmp24 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp5, i32 0, i32 6 + %tmp25 = getelementptr i32, i32 addrspace(1)* %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4 + %tmp27 = zext i32 %tmp26 to i64 + %tmp28 = shl nuw nsw i64 %tmp27, 2 + %tmp29 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp28 + %tmp30 = bitcast i8 addrspace(1)* %tmp29 to <2 x float> addrspace(1)* + %tmp31 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 2, i64 0 + %18 = bitcast i32 addrspace(1)* %tmp31 to <3 x i32> addrspace(1)* + %19 = load <3 x i32>, <3 x i32> addrspace(1)* %18, align 4 + %tmp325 = extractelement <3 x i32> %19, i32 0 + %tmp386 = extractelement <3 x i32> %19, i32 1 + %tmp447 = extractelement <3 x i32> %19, i32 2 + %tmp33 = sext i32 %tmp325 to i64 + %tmp34 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp33 + %tmp35 = load <2 x float>, <2 x float> addrspace(1)* %tmp34, align 8 + %tmp36 = extractelement <2 x float> %tmp35, i32 1 + %tmp39 = sext i32 %tmp386 to i64 + %tmp40 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp39 + %tmp41 = load <2 x float>, <2 x float> addrspace(1)* %tmp40, align 8 + %tmp42 = extractelement <2 x float> %tmp41, i32 1 + %tmp45 = sext i32 %tmp447 to i64 + %tmp46 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp45 + %tmp47 = load <2 x float>, <2 x float> addrspace(1)* %tmp46, align 8 + %tmp48 = extractelement <2 x float> %tmp47, i32 1 + %tmp49 = getelementptr i32, i32 addrspace(1)* %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %tmp50 = load i32, i32 addrspace(1)* %tmp49, align 4 + %tmp51 = zext i32 %tmp50 to i64 + %tmp52 = shl nuw nsw i64 %tmp51, 2 + %tmp53 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp52 + %tmp54 = bitcast i8 addrspace(1)* %tmp53 to <4 x float> addrspace(1)* + %tmp55 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 0, i64 0 + %20 = bitcast i32 addrspace(1)* %tmp55 to <2 x i32> addrspace(1)* + %21 = load <2 x 
i32>, <2 x i32> addrspace(1)* %20, align 4 + %tmp568 = extractelement <2 x i32> %21, i32 0 + %tmp639 = extractelement <2 x i32> %21, i32 1 + %tmp57 = sext i32 %tmp568 to i64 + %tmp58 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp57 + %tmp59 = load <4 x float>, <4 x float> addrspace(1)* %tmp58, align 16 + %tmp60 = extractelement <4 x float> %tmp59, i32 0 + %tmp61 = extractelement <4 x float> %tmp59, i32 1 + %tmp64 = sext i32 %tmp639 to i64 + %tmp65 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp64 + %tmp66 = load <4 x float>, <4 x float> addrspace(1)* %tmp65, align 16 + %tmp67 = extractelement <4 x float> %tmp16, i64 0 + %tmp69 = fsub fast float -0.000000e+00, %tmp67 + %tmp70 = fmul float %tmp67, 0.000000e+00 + %tmp = fmul fast float %tmp67, undef + %tmp71 = fsub fast float %tmp, %tmp70 + %tmp73 = fadd fast float %tmp, undef + %tmp74 = insertelement <4 x float> , float %tmp69, i32 0 + %tmp75 = insertelement <4 x float> %tmp74, float %tmp71, i32 1 + %tmp76 = insertelement <4 x float> %tmp75, float %tmp73, i32 2 + store <4 x float> %tmp76, <4 x float>* %tmp24, align 16 + %tmp77 = fsub float undef, %tmp60 + %tmp78 = fsub float undef, %tmp61 + %tmp79 = extractelement <4 x float> %tmp66, i32 2 + %tmp80 = extractelement <4 x float> %tmp59, i32 2 + %tmp81 = fsub float %tmp79, %tmp80 + %tmp82 = fmul fast float %tmp81, undef + %tmp83 = fmul fast float %tmp78, undef + %tmp84 = fadd fast float %tmp83, %tmp77 + %tmp85 = fadd fast float %tmp84, undef + %tmp86 = fmul float %tmp82, %tmp82 + %tmp87 = fdiv float 1.000000e+00, %tmp86 + tail call void @llvm.dbg.value(metadata float %tmp87, metadata !5, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)) #3, !dbg !8 + %tmp88 = fmul float %tmp82, 0.000000e+00 + %tmp89 = fsub fast float %tmp85, %tmp88 + %tmp90 = fdiv float %tmp89, %tmp86 + %tmp91 = fsub float 1.000000e+00, %tmp87 + %tmp92 = fsub float %tmp91, %tmp90 + %tmp93 = fmul float %tmp42, %tmp87 + %tmp94 = call float @llvm.fmuladd.f32(float %tmp92, float %tmp36, float %tmp93) + %tmp95 = call float @llvm.fmuladd.f32(float %tmp48, float undef, float %tmp94) + %tmp96 = fsub float extractelement (<2 x float> fadd (<2 x float> fmul (<2 x float> undef, <2 x float> undef), <2 x float> undef), i64 1), %tmp95 + %tmp97 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp5, i32 0, i32 8, i32 1 + call void @func(float %tmp96, i64 0, i16* nonnull %tmp97) #3 + %tmp984 = bitcast [16 x i8] addrspace(3)* %17 to i8 addrspace(3)* + %tmp99 = getelementptr inbounds %struct.snork, %struct.snork addrspace(1)* %arg4, i64 %tmp12, i32 8, i32 1, i64 0 + call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %tmp99, i8 addrspace(3)* %tmp984, i64 16, i32 16, i1 false) + call void @llvm.lifetime.end.p0i8(i64 144, i8* nonnull %tmp23) #3 + ret void + } + + declare void @func(float, i64, i16*) + declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + declare float @llvm.fmuladd.f32(float, float, float) #1 + declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 + declare i32 @llvm.amdgcn.workitem.id.x() #1 + declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1 + declare i32 @llvm.amdgcn.workitem.id.y() #1 + declare i32 @llvm.amdgcn.workitem.id.z() #1 + declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #0 + declare void 
@llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i64, i32, i1) #0 + + attributes #0 = { argmemonly nounwind } + attributes #1 = { nounwind readnone speculatable } + attributes #2 = { convergent nounwind "amdgpu-dispatch-ptr" "amdgpu-flat-scratch" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="gfx900" "target-features"="+fp32-denormals" } + attributes #3 = { nounwind } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !1 = !DIFile(filename: "foo.cl", directory: "/dev/null") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{} + !4 = !{i32 0, i32 256} + !5 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 102, type: !7) + !6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 81, isLocal: false, isDefinition: true, scopeLine: 86, flags: DIFlagPrototyped, isOptimized: true, unit: !0) + !7 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + !8 = !DILocation(line: 102, column: 8, scope: !6) + +... +--- + +# CHECK: name: sched_dbg_value_crash +# CHECK: DBG_VALUE debug-use %99, debug-use %noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 + +name: sched_dbg_value_crash +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%vgpr0', virtual-reg: '%0' } + - { reg: '%vgpr1', virtual-reg: '%1' } + - { reg: '%vgpr2', virtual-reg: '%2' } + - { reg: '%sgpr4_sgpr5', virtual-reg: '%3' } + - { reg: '%sgpr6_sgpr7', virtual-reg: '%4' } +fixedStack: +stack: + - { id: 0, name: tmp5, type: default, offset: 0, size: 128, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, di-variable: '', di-expression: '', di-location: '' } +constants: +body: | + bb.0.bb: + liveins: %vgpr0, %vgpr1, %vgpr2, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr32, %sgpr101 + + %4:sgpr_64 = COPY %sgpr6_sgpr7 + %3:sgpr_64 = COPY %sgpr4_sgpr5 + %2:vgpr_32 = COPY %vgpr2 + %1:vgpr_32 = COPY %vgpr1 + %0:vgpr_32 = COPY %vgpr0 + %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0 + %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0 + %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0 + %11:sreg_32_xm0 = S_LSHR_B32 %10.sub0, 16, implicit-def dead %scc + %12:sreg_32_xm0 = S_MUL_I32 %11, %10.sub1 + %13:vgpr_32 = V_MUL_LO_I32 0, %0, implicit %exec + %14:vgpr_32 = V_MUL_LO_I32 %1, %10.sub1, implicit %exec + %15:vgpr_32 = V_ADD_I32_e32 0, %13, implicit-def dead %vcc, implicit %exec + %16:vgpr_32 = V_ADD_I32_e32 0, %15, implicit-def dead %vcc, implicit %exec + %17:vgpr_32 = IMPLICIT_DEF + %18:sreg_64 = S_MOV_B64 0 + %19:sreg_32_xm0_xexec = IMPLICIT_DEF + %20:vgpr_32 = V_ADD_I32_e32 %19, %0, implicit-def dead %vcc, implicit %exec + %21:vreg_64, dead %22:sreg_64 = V_MAD_I64_I32 %20, 12, %7, 0, implicit %exec + %23:vgpr_32 = GLOBAL_LOAD_DWORD %21, 4, 0, 0, implicit %exec + %24:vreg_64, dead %25:sreg_64 = V_MAD_I64_I32 %20, 48, 
%8, 0, implicit %exec + %26:vreg_128 = IMPLICIT_DEF + undef %27.sub0:sreg_64_xexec = S_LOAD_DWORD_IMM %6, 0, 0 + %27.sub1:sreg_64_xexec = S_MOV_B32 0 + %28:sreg_64 = S_LSHL_B64 %27, 2, implicit-def dead %scc + undef %29.sub0:sreg_64 = S_ADD_U32 %5.sub0, %28.sub0, implicit-def %scc + %29.sub1:sreg_64 = S_ADDC_U32 %5.sub1, %28.sub1, implicit-def dead %scc, implicit killed %scc + undef %30.sub0:sreg_64_xexec = S_LOAD_DWORD_IMM %6, 4, 0 + %27.sub0:sreg_64_xexec = IMPLICIT_DEF + %31:sreg_64 = S_LSHL_B64 %27, 2, implicit-def dead %scc + %32:sreg_32_xm0 = S_ADD_U32 0, %31.sub0, implicit-def %scc + %33:sgpr_32 = S_ADDC_U32 %5.sub1, %31.sub1, implicit-def dead %scc, implicit killed %scc + %34:vgpr_32 = IMPLICIT_DEF + %35:vreg_64, dead %36:sreg_64 = V_MAD_I64_I32 %23, %34, 0, 0, implicit %exec + %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 32, 0, 0, implicit %exec + undef %38.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %37.sub0, implicit %exec + %38.sub0:vreg_64 = COPY %37.sub0 + %39:vreg_64 = V_LSHLREV_B64 3, %38, implicit %exec + undef %40.sub0:vreg_64, %41:sreg_64_xexec = V_ADD_I32_e64 0, %39.sub0, implicit %exec + %42:vgpr_32 = COPY %33 + %40.sub1:vreg_64, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %42, %39.sub1, %41, implicit %exec + %44:vreg_64 = GLOBAL_LOAD_DWORDX2 %40, 0, 0, 0, implicit %exec :: (load 8 from %ir.tmp34) + undef %45.sub1:vreg_64 = IMPLICIT_DEF + %45.sub0:vreg_64 = COPY %37.sub1 + %46:vreg_64 = V_LSHLREV_B64 3, %45, implicit %exec + undef %47.sub0:vreg_64, %48:sreg_64_xexec = V_ADD_I32_e64 %32, %46.sub0, implicit %exec + %49:vgpr_32 = COPY %33 + %47.sub1:vreg_64, dead %50:sreg_64_xexec = V_ADDC_U32_e64 %49, %46.sub1, %48, implicit %exec + %51:vreg_64 = IMPLICIT_DEF + undef %52.sub0:vreg_64 = GLOBAL_LOAD_DWORD %35, 40, 0, 0, implicit %exec :: (load 4 from %ir.18 + 8) + %52.sub1:vreg_64 = IMPLICIT_DEF + %53:vreg_64 = V_LSHLREV_B64 3, %52, implicit %exec + undef %54.sub0:vreg_64, %55:sreg_64_xexec = V_ADD_I32_e64 0, %53.sub0, implicit %exec + %56:vgpr_32 = COPY %33 + %54.sub1:vreg_64, dead %57:sreg_64_xexec = V_ADDC_U32_e64 0, %53.sub1, %55, implicit %exec + %58:vreg_64 = IMPLICIT_DEF + %30.sub1:sreg_64_xexec = IMPLICIT_DEF + %59:sreg_64 = IMPLICIT_DEF + %60:sreg_32_xm0 = S_ADD_U32 %5.sub0, %59.sub0, implicit-def %scc + %61:sgpr_32 = S_ADDC_U32 %5.sub1, %59.sub1, implicit-def dead %scc, implicit killed %scc + %62:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 0, 0, 0, implicit %exec :: (load 8 from %ir.20, align 4) + undef %63.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %62.sub0, implicit %exec + %63.sub0:vreg_64 = COPY %62.sub0 + %64:vreg_64 = IMPLICIT_DEF + undef %65.sub0:vreg_64, %66:sreg_64_xexec = V_ADD_I32_e64 %60, %64.sub0, implicit %exec + %67:vgpr_32 = COPY %61 + %65.sub1:vreg_64, dead %68:sreg_64_xexec = V_ADDC_U32_e64 %67, %64.sub1, %66, implicit %exec + %69:vreg_128 = GLOBAL_LOAD_DWORDX4 %65, 0, 0, 0, implicit %exec :: (load 16 from %ir.tmp58) + undef %70.sub1:vreg_64 = IMPLICIT_DEF + %70.sub0:vreg_64 = IMPLICIT_DEF + %71:vreg_64 = IMPLICIT_DEF + undef %72.sub0:vreg_64, %73:sreg_64_xexec = V_ADD_I32_e64 %60, %71.sub0, implicit %exec + %74:vgpr_32 = COPY %61 + %72.sub1:vreg_64, dead %75:sreg_64_xexec = V_ADDC_U32_e64 0, %71.sub1, %73, implicit %exec + %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, implicit %exec + %77:vgpr_32 = IMPLICIT_DEF + %78:vgpr_32 = IMPLICIT_DEF + %79:vgpr_32 = V_MUL_F32_e32 0, %77, implicit %exec + %80:vgpr_32 = IMPLICIT_DEF + %81:vgpr_32 = IMPLICIT_DEF + %84:vgpr_32 = IMPLICIT_DEF + BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr101, 108, 0, 0, 0, 
implicit %exec + BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr101, 104, 0, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr101, 100, 0, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr101, 96, 0, 0, 0, implicit %exec + %85:vgpr_32 = IMPLICIT_DEF + %86:vgpr_32 = IMPLICIT_DEF + %87:vgpr_32 = IMPLICIT_DEF + %88:vgpr_32 = IMPLICIT_DEF + %90:vgpr_32 = IMPLICIT_DEF + %91:vgpr_32, dead %92:sreg_64 = V_DIV_SCALE_F32 %90, %90, 1065353216, implicit %exec + %95:vgpr_32 = V_FMA_F32 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit %exec + %96:vgpr_32, %97:sreg_64 = V_DIV_SCALE_F32 1065353216, %90, 1065353216, implicit %exec + %98:vgpr_32 = IMPLICIT_DEF + %99:vgpr_32 = IMPLICIT_DEF + %100:vgpr_32 = IMPLICIT_DEF + %101:vgpr_32 = IMPLICIT_DEF + %102:vgpr_32 = IMPLICIT_DEF + %103:vgpr_32 = IMPLICIT_DEF + %104:vgpr_32 = IMPLICIT_DEF + %105:vgpr_32 = IMPLICIT_DEF + %106:vgpr_32, dead %107:sreg_64 = V_DIV_SCALE_F32 %90, %90, %105, implicit %exec + %108:vgpr_32 = V_RCP_F32_e32 0, implicit %exec + %109:vgpr_32 = IMPLICIT_DEF + %110:vgpr_32 = V_FMA_F32 0, 0, 0, 0, 0, 0, 0, 0, implicit %exec + %111:vgpr_32, %112:sreg_64 = V_DIV_SCALE_F32 0, 0, 0, implicit %exec + %113:vgpr_32 = V_MUL_F32_e32 0, %110, implicit %exec + %114:vgpr_32 = IMPLICIT_DEF + %115:vgpr_32 = IMPLICIT_DEF + %116:vgpr_32 = IMPLICIT_DEF + %vcc = IMPLICIT_DEF + %117:vgpr_32 = V_DIV_FMAS_F32 0, %116, 0, %110, 0, %115, 0, 0, implicit killed %vcc, implicit %exec + %118:vgpr_32 = V_DIV_FIXUP_F32 0, %117, 0, %90, 0, %105, 0, 0, implicit %exec + %119:vgpr_32 = IMPLICIT_DEF + %120:vgpr_32 = IMPLICIT_DEF + %121:vgpr_32 = IMPLICIT_DEF + %122:vgpr_32 = IMPLICIT_DEF + %123:vgpr_32 = IMPLICIT_DEF + %124:vgpr_32 = IMPLICIT_DEF + %125:vgpr_32 = IMPLICIT_DEF + %126:vgpr_32 = IMPLICIT_DEF + DBG_VALUE debug-use %103, debug-use _, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 + ADJCALLSTACKUP 0, 0, implicit-def %sgpr32, implicit %sgpr32 + %127:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead %scc + %sgpr4 = COPY %sgpr101 + %vgpr0 = COPY %124 + %vgpr1_vgpr2 = IMPLICIT_DEF + %vgpr3 = COPY %126 + dead %sgpr30_sgpr31 = SI_CALL %127, @func, csr_amdgpu_highregs, implicit %sgpr0_sgpr1_sgpr2_sgpr3, implicit %sgpr4, implicit %vgpr0, implicit %vgpr1_vgpr2, implicit killed %vgpr3 + ADJCALLSTACKDOWN 0, 0, implicit-def %sgpr32, implicit %sgpr32 + %128:vreg_64, dead %129:sreg_64 = V_MAD_I64_I32 %20, %34, 0, 0, implicit %exec + S_ENDPGM + +... 
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 8c67d6e742716..de5229e0550a0 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -497,3 +497,26 @@ entry: store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8 ret void } + +; GCN-LABEL: {{^}}sdwa_crash_inlineasm_de +; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff +; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x10000, +define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { +bb: + br label %bb1 + +bb1: ; preds = %bb11, %bb + %tmp = phi <2 x i32> [ %tmp12, %bb11 ], [ undef, %bb ] + br i1 true, label %bb2, label %bb11 + +bb2: ; preds = %bb1 + %tmp3 = call i32 asm "v_and_b32_e32 $0, $1, $2", "=v,s,v"(i32 65535, i32 undef) #1 + %tmp5 = or i32 %tmp3, 65536 + %tmp6 = insertelement <2 x i32> %tmp, i32 %tmp5, i64 0 + br label %bb11 + +bb11: ; preds = %bb10, %bb2 + %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ] + br label %bb1 +} diff --git a/test/CodeGen/ARM/and-load-combine.ll b/test/CodeGen/ARM/and-load-combine.ll index 6f0c12359597c..f8e5d411c97cc 100644 --- a/test/CodeGen/ARM/and-load-combine.ll +++ b/test/CodeGen/ARM/and-load-combine.ll @@ -5,34 +5,30 @@ ; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefix=THUMB2 define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a, + i16* nocapture readonly %b) { ; ARM-LABEL: cmp_xor8_short_short: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: ldrh r1, [r1] -; ARM-NEXT: eor r1, r1, r0 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: teq r1, r2 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_xor8_short_short: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: ldrh r1, [r1] -; ARMEB-NEXT: eor r1, r1, r0 +; ARMEB: ldrb r2, [r0, #1] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #1] +; ARMEB-NEXT: teq r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_xor8_short_short: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r0, [r0] -; THUMB1-NEXT: ldrh r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: eors r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB0_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -40,16 +36,13 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_xor8_short_short: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ldrh r1, [r1] -; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: teq.w r1, r2 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i16* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %1 = load i16, i16* %b, align 2 @@ -60,34 +53,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_xor8_short_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: eor r1, r1, r0 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: teq r1, r2 ; 
ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_xor8_short_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: eor r1, r1, r0 +; ARMEB: ldrb r2, [r0, #1] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: teq r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_xor8_short_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: eors r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB1_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -95,16 +84,13 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_xor8_short_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: teq.w r1, r2 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %conv = zext i16 %0 to i32 @@ -116,34 +102,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_xor8_int_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: eor r1, r1, r0 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: teq r1, r2 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_xor8_int_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: eor r1, r1, r0 +; ARMEB: ldrb r2, [r0, #3] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: teq r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_xor8_int_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: eors r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB2_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -151,16 +133,13 @@ define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_xor8_int_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: teq.w r1, r2 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -171,36 +150,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_xor16: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: movw r2, #65535 -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: eor r1, r1, r0 +; ARM: ldrh r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: 
tst r1, r2 +; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: teq r1, r2 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_xor16: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: movw r2, #65535 -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: eor r1, r1, r0 +; ARMEB: ldrh r2, [r0, #2] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, r2 +; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: teq r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_xor16: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrh r0, [r0] +; THUMB1-NEXT: ldrh r2, [r1] ; THUMB1-NEXT: eors r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #16 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB3_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -208,16 +181,13 @@ define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_xor16: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: lsls r0, r0, #16 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrh r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrh r1, [r1] +; THUMB2-NEXT: teq.w r1, r2 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -228,34 +198,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a, + i16* nocapture readonly %b) { ; ARM-LABEL: cmp_or8_short_short: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: ldrh r1, [r1] -; ARM-NEXT: orr r1, r1, r0 +; ARM: ldrb r0, [r0] +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: orrs r0, r1, r0 ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_or8_short_short: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: ldrh r1, [r1] -; ARMEB-NEXT: orr r1, r1, r0 +; ARMEB: ldrb r0, [r0, #1] +; ARMEB-NEXT: ldrb r1, [r1, #1] +; ARMEB-NEXT: orrs r0, r1, r0 ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_or8_short_short: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r0, [r0] -; THUMB1-NEXT: ldrh r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: orrs r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB4_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -263,16 +229,13 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_or8_short_short: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ldrh r1, [r1] +; THUMB2: ldrb r0, [r0] +; THUMB2-NEXT: ldrb r1, [r1] ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 ; THUMB2-NEXT: mov.w r0, #0 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i16* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %1 = load i16, i16* %b, align 2 @@ -283,34 +246,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_or8_short_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: orr r1, r1, r0 +; ARM: ldrb r0, [r0] +; 
ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: orrs r0, r1, r0 ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_or8_short_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: orr r1, r1, r0 +; ARMEB: ldrb r0, [r0, #1] +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: orrs r0, r1, r0 ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_or8_short_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: orrs r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB5_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -318,16 +277,13 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_or8_short_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] +; THUMB2: ldrb r0, [r0] +; THUMB2-NEXT: ldrb r1, [r1] ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 ; THUMB2-NEXT: mov.w r0, #0 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %conv = zext i16 %0 to i32 @@ -339,34 +295,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_or8_int_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: orr r1, r1, r0 +; ARM: ldrb r0, [r0] +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: orrs r0, r1, r0 ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_or8_int_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: orr r1, r1, r0 +; ARMEB: ldrb r0, [r0, #3] +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: orrs r0, r1, r0 ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_or8_int_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrb r0, [r0] +; THUMB1-NEXT: ldrb r2, [r1] ; THUMB1-NEXT: orrs r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB6_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -374,16 +326,13 @@ define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_or8_int_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] +; THUMB2: ldrb r0, [r0] +; THUMB2-NEXT: ldrb r1, [r1] ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 ; THUMB2-NEXT: mov.w r0, #0 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -394,36 +343,30 @@ entry: } define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_or16: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: movw r2, #65535 -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: orr r1, r1, r0 +; ARM: ldrh r0, [r0] +; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: orrs 
r0, r1, r0 ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, r2 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_or16: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: movw r2, #65535 -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: orr r1, r1, r0 +; ARMEB: ldrh r0, [r0, #2] +; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: orrs r0, r1, r0 ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_or16: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] +; THUMB1: ldrh r0, [r0] +; THUMB1-NEXT: ldrh r2, [r1] ; THUMB1-NEXT: orrs r2, r0 ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #16 +; THUMB1-NEXT: cmp r2, #0 ; THUMB1-NEXT: beq .LBB7_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -431,16 +374,13 @@ define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_or16: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] +; THUMB2: ldrh r0, [r0] +; THUMB2-NEXT: ldrh r1, [r1] ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: lsls r0, r0, #16 ; THUMB2-NEXT: mov.w r0, #0 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -451,34 +391,29 @@ entry: } define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a, + i16* nocapture readonly %b) { ; ARM-LABEL: cmp_and8_short_short: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r1, [r1] -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: and r1, r0, r1 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: tst r2, r1 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_and8_short_short: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r1, [r1] -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: and r1, r0, r1 +; ARMEB: ldrb r2, [r0, #1] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #1] +; ARMEB-NEXT: tst r2, r1 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_and8_short_short: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r1, [r1] -; THUMB1-NEXT: ldrh r2, [r0] -; THUMB1-NEXT: ands r2, r1 +; THUMB1: ldrb r2, [r1] +; THUMB1-NEXT: ldrb r3, [r0] ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: tst r3, r2 ; THUMB1-NEXT: beq .LBB8_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -486,16 +421,13 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_and8_short_short: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r1, [r1] -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ands r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: tst r2, r1 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i16* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %1 = load i16, i16* %b, align 2 @@ -506,34 +438,29 @@ entry: } define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_and8_short_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r0, [r0] -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: and r1, r1, r0 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov 
r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: tst r1, r2 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_and8_short_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r0, [r0] -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: and r1, r1, r0 +; ARMEB: ldrb r2, [r0, #1] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: tst r1, r2 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_and8_short_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r0, [r0] -; THUMB1-NEXT: ldr r2, [r1] -; THUMB1-NEXT: ands r2, r0 +; THUMB1: ldrb r2, [r0] +; THUMB1-NEXT: ldrb r3, [r1] ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: tst r3, r2 ; THUMB1-NEXT: beq .LBB9_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -541,16 +468,13 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_and8_short_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r0, [r0] -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ands r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: tst r1, r2 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i16, i16* %a, align 2 %1 = load i32, i32* %b, align 4 @@ -562,34 +486,29 @@ entry: } define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_and8_int_int: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: and r1, r0, r1 +; ARM: ldrb r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, #255 +; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: tst r2, r1 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_and8_int_int: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: and r1, r0, r1 +; ARMEB: ldrb r2, [r0, #3] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, #255 +; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: tst r2, r1 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_and8_int_int: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r1, [r1] -; THUMB1-NEXT: ldr r2, [r0] -; THUMB1-NEXT: ands r2, r1 +; THUMB1: ldrb r2, [r1] +; THUMB1-NEXT: ldrb r3, [r0] ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #24 +; THUMB1-NEXT: tst r3, r2 ; THUMB1-NEXT: beq .LBB10_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -597,16 +516,13 @@ define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_and8_int_int: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ands r0, r1 -; THUMB2-NEXT: lsls r0, r0, #24 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrb r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: tst r2, r1 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -617,36 +533,29 @@ entry: } define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a, + i32* nocapture readonly %b) { ; ARM-LABEL: cmp_and16: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: movw r2, #65535 -; ARM-NEXT: ldr 
r0, [r0] -; ARM-NEXT: and r1, r0, r1 +; ARM: ldrh r2, [r0] ; ARM-NEXT: mov r0, #0 -; ARM-NEXT: tst r1, r2 +; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: tst r2, r1 ; ARM-NEXT: movweq r0, #1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: cmp_and16: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: movw r2, #65535 -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: and r1, r0, r1 +; ARMEB: ldrh r2, [r0, #2] ; ARMEB-NEXT: mov r0, #0 -; ARMEB-NEXT: tst r1, r2 +; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: tst r2, r1 ; ARMEB-NEXT: movweq r0, #1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: cmp_and16: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r1, [r1] -; THUMB1-NEXT: ldr r2, [r0] -; THUMB1-NEXT: ands r2, r1 +; THUMB1: ldrh r2, [r1] +; THUMB1-NEXT: ldrh r3, [r0] ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: lsls r2, r2, #16 +; THUMB1-NEXT: tst r3, r2 ; THUMB1-NEXT: beq .LBB11_2 ; THUMB1-NEXT: @ %bb.1: @ %entry ; THUMB1-NEXT: mov r0, r1 @@ -654,16 +563,13 @@ define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a, ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: cmp_and16: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: ands r0, r1 -; THUMB2-NEXT: lsls r0, r0, #16 -; THUMB2-NEXT: mov.w r0, #0 +; THUMB2: ldrh r2, [r0] +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: ldrh r1, [r1] +; THUMB2-NEXT: tst r2, r1 ; THUMB2-NEXT: it eq ; THUMB2-NEXT: moveq r0, #1 ; THUMB2-NEXT: bx lr - i32* nocapture readonly %b) { entry: %0 = load i32, i32* %a, align 4 %1 = load i32, i32* %b, align 4 @@ -675,35 +581,31 @@ entry: define arm_aapcscc i32 @add_and16(i32* nocapture readonly %a, i32 %y, i32 %z) { ; ARM-LABEL: add_and16: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] -; ARM-NEXT: add r1, r1, r2 +; ARM: add r1, r1, r2 +; ARM-NEXT: ldrh r0, [r0] +; ARM-NEXT: uxth r1, r1 ; ARM-NEXT: orr r0, r0, r1 -; ARM-NEXT: uxth r0, r0 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: add_and16: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] -; ARMEB-NEXT: add r1, r1, r2 +; ARMEB: add r1, r1, r2 +; ARMEB-NEXT: ldrh r0, [r0, #2] +; ARMEB-NEXT: uxth r1, r1 ; ARMEB-NEXT: orr r0, r0, r1 -; ARMEB-NEXT: uxth r0, r0 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: add_and16: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: adds r1, r1, r2 -; THUMB1-NEXT: ldr r0, [r0] +; THUMB1: adds r1, r1, r2 +; THUMB1-NEXT: uxth r1, r1 +; THUMB1-NEXT: ldrh r0, [r0] ; THUMB1-NEXT: orrs r0, r1 -; THUMB1-NEXT: uxth r0, r0 ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: add_and16: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] -; THUMB2-NEXT: add r1, r2 +; THUMB2: add r1, r2 +; THUMB2-NEXT: ldrh r0, [r0] +; THUMB2-NEXT: uxth r1, r1 ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: uxth r0, r0 ; THUMB2-NEXT: bx lr entry: %x = load i32, i32* %a, align 4 @@ -715,43 +617,39 @@ entry: define arm_aapcscc i32 @test1(i32* %a, i32* %b, i32 %x, i32 %y) { ; ARM-LABEL: test1: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: mul r2, r2, r3 -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: ldr r0, [r0] +; ARM: mul r2, r2, r3 +; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: ldrh r0, [r0] ; ARM-NEXT: eor r0, r0, r1 -; ARM-NEXT: orr r0, r0, r2 -; ARM-NEXT: uxth r0, r0 +; ARM-NEXT: uxth r1, r2 +; ARM-NEXT: orr r0, r0, r1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: test1: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: mul r2, r2, r3 -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: ldr r0, [r0] +; ARMEB: mul r2, r2, r3 +; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: ldrh r0, [r0, #2] ; ARMEB-NEXT: eor r0, r0, r1 -; ARMEB-NEXT: orr r0, r0, r2 -; ARMEB-NEXT: uxth r0, r0 +; 
ARMEB-NEXT: uxth r1, r2 +; ARMEB-NEXT: orr r0, r0, r1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: test1: -; THUMB1: @ %bb.0: @ %entry +; THUMB1: ldrh r1, [r1] +; THUMB1-NEXT: ldrh r4, [r0] +; THUMB1-NEXT: eors r4, r1 ; THUMB1-NEXT: muls r2, r3, r2 -; THUMB1-NEXT: ldr r1, [r1] -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: eors r0, r1 -; THUMB1-NEXT: orrs r0, r2 -; THUMB1-NEXT: uxth r0, r0 -; THUMB1-NEXT: bx lr +; THUMB1-NEXT: uxth r0, r2 +; THUMB1-NEXT: orrs r0, r4 +; THUMB1-NEXT: pop ; ; THUMB2-LABEL: test1: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: muls r2, r3, r2 -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ldr r0, [r0] +; THUMB2: ldrh r1, [r1] +; THUMB2-NEXT: ldrh r0, [r0] ; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: orrs r0, r2 -; THUMB2-NEXT: uxth r0, r0 +; THUMB2-NEXT: mul r1, r2, r3 +; THUMB2-NEXT: uxth r1, r1 +; THUMB2-NEXT: orrs r0, r1 ; THUMB2-NEXT: bx lr entry: %0 = load i32, i32* %a, align 4 @@ -765,8 +663,7 @@ entry: define arm_aapcscc i32 @test2(i32* %a, i32* %b, i32 %x, i32 %y) { ; ARM-LABEL: test2: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r1, [r1] +; ARM: ldr r1, [r1] ; ARM-NEXT: ldr r0, [r0] ; ARM-NEXT: mul r1, r2, r1 ; ARM-NEXT: eor r0, r0, r3 @@ -775,8 +672,7 @@ define arm_aapcscc i32 @test2(i32* %a, i32* %b, i32 %x, i32 %y) { ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: test2: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r1, [r1] +; ARMEB: ldr r1, [r1] ; ARMEB-NEXT: ldr r0, [r0] ; ARMEB-NEXT: mul r1, r2, r1 ; ARMEB-NEXT: eor r0, r0, r3 @@ -785,8 +681,7 @@ define arm_aapcscc i32 @test2(i32* %a, i32* %b, i32 %x, i32 %y) { ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: test2: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r1, [r1] +; THUMB1: ldr r1, [r1] ; THUMB1-NEXT: muls r1, r2, r1 ; THUMB1-NEXT: ldr r0, [r0] ; THUMB1-NEXT: eors r0, r3 @@ -795,8 +690,7 @@ define arm_aapcscc i32 @test2(i32* %a, i32* %b, i32 %x, i32 %y) { ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: test2: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r1, [r1] +; THUMB2: ldr r1, [r1] ; THUMB2-NEXT: ldr r0, [r0] ; THUMB2-NEXT: muls r1, r2, r1 ; THUMB2-NEXT: eors r0, r3 @@ -815,8 +709,7 @@ entry: define arm_aapcscc i32 @test3(i32* %a, i32* %b, i32 %x, i16* %y) { ; ARM-LABEL: test3: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r0, [r0] +; ARM: ldr r0, [r0] ; ARM-NEXT: mul r1, r2, r0 ; ARM-NEXT: ldrh r2, [r3] ; ARM-NEXT: eor r0, r0, r2 @@ -825,8 +718,7 @@ define arm_aapcscc i32 @test3(i32* %a, i32* %b, i32 %x, i16* %y) { ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: test3: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r0, [r0] +; ARMEB: ldr r0, [r0] ; ARMEB-NEXT: mul r1, r2, r0 ; ARMEB-NEXT: ldrh r2, [r3] ; ARMEB-NEXT: eor r0, r0, r2 @@ -835,8 +727,7 @@ define arm_aapcscc i32 @test3(i32* %a, i32* %b, i32 %x, i16* %y) { ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: test3: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r0, [r0] +; THUMB1: ldr r0, [r0] ; THUMB1-NEXT: muls r2, r0, r2 ; THUMB1-NEXT: ldrh r1, [r3] ; THUMB1-NEXT: eors r1, r0 @@ -845,8 +736,7 @@ define arm_aapcscc i32 @test3(i32* %a, i32* %b, i32 %x, i16* %y) { ; THUMB1-NEXT: bx lr ; ; THUMB2-LABEL: test3: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r0, [r0] +; THUMB2: ldr r0, [r0] ; THUMB2-NEXT: mul r1, r2, r0 ; THUMB2-NEXT: ldrh r2, [r3] ; THUMB2-NEXT: eors r0, r2 @@ -866,43 +756,39 @@ entry: define arm_aapcscc i32 @test4(i32* %a, i32* %b, i32 %x, i32 %y) { ; ARM-LABEL: test4: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: mul r2, r2, r3 -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: ldr r0, [r0] +; ARM: mul r2, r2, r3 +; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: ldrh r0, [r0] ; ARM-NEXT: eor r0, r0, r1 
-; ARM-NEXT: orr r0, r0, r2 -; ARM-NEXT: uxth r0, r0 +; ARM-NEXT: uxth r1, r2 +; ARM-NEXT: orr r0, r0, r1 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: test4: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: mul r2, r2, r3 -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: ldr r0, [r0] +; ARMEB: mul r2, r2, r3 +; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: ldrh r0, [r0, #2] ; ARMEB-NEXT: eor r0, r0, r1 -; ARMEB-NEXT: orr r0, r0, r2 -; ARMEB-NEXT: uxth r0, r0 +; ARMEB-NEXT: uxth r1, r2 +; ARMEB-NEXT: orr r0, r0, r1 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: test4: -; THUMB1: @ %bb.0: @ %entry +; THUMB1: ldrh r1, [r1] +; THUMB1-NEXT: ldrh r4, [r0] +; THUMB1-NEXT: eors r4, r1 ; THUMB1-NEXT: muls r2, r3, r2 -; THUMB1-NEXT: ldr r1, [r1] -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: eors r0, r1 -; THUMB1-NEXT: orrs r0, r2 -; THUMB1-NEXT: uxth r0, r0 -; THUMB1-NEXT: bx lr +; THUMB1-NEXT: uxth r0, r2 +; THUMB1-NEXT: orrs r0, r4 +; THUMB1-NEXT: pop ; ; THUMB2-LABEL: test4: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: muls r2, r3, r2 -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ldr r0, [r0] +; THUMB2: ldrh r1, [r1] +; THUMB2-NEXT: ldrh r0, [r0] ; THUMB2-NEXT: eors r0, r1 -; THUMB2-NEXT: orrs r0, r2 -; THUMB2-NEXT: uxth r0, r0 +; THUMB2-NEXT: mul r1, r2, r3 +; THUMB2-NEXT: uxth r1, r1 +; THUMB2-NEXT: orrs r0, r1 ; THUMB2-NEXT: bx lr entry: %0 = load i32, i32* %a, align 4 @@ -916,43 +802,39 @@ entry: define arm_aapcscc i32 @test5(i32* %a, i32* %b, i32 %x, i16 zeroext %y) { ; ARM-LABEL: test5: -; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldr r1, [r1] -; ARM-NEXT: ldr r0, [r0] +; ARM: ldr r1, [r1] +; ARM-NEXT: ldrh r0, [r0] ; ARM-NEXT: mul r1, r2, r1 ; ARM-NEXT: eor r0, r0, r3 +; ARM-NEXT: uxth r1, r1 ; ARM-NEXT: orr r0, r0, r1 -; ARM-NEXT: uxth r0, r0 ; ARM-NEXT: bx lr ; ; ARMEB-LABEL: test5: -; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldr r1, [r1] -; ARMEB-NEXT: ldr r0, [r0] +; ARMEB: ldr r1, [r1] +; ARMEB-NEXT: ldrh r0, [r0, #2] ; ARMEB-NEXT: mul r1, r2, r1 ; ARMEB-NEXT: eor r0, r0, r3 +; ARMEB-NEXT: uxth r1, r1 ; ARMEB-NEXT: orr r0, r0, r1 -; ARMEB-NEXT: uxth r0, r0 ; ARMEB-NEXT: bx lr ; ; THUMB1-LABEL: test5: -; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldr r1, [r1] -; THUMB1-NEXT: muls r1, r2, r1 -; THUMB1-NEXT: ldr r0, [r0] -; THUMB1-NEXT: eors r0, r3 -; THUMB1-NEXT: orrs r0, r1 +; THUMB1: ldrh r4, [r0] +; THUMB1-NEXT: eors r4, r3 +; THUMB1-NEXT: ldr r0, [r1] +; THUMB1-NEXT: muls r0, r2, r0 ; THUMB1-NEXT: uxth r0, r0 -; THUMB1-NEXT: bx lr +; THUMB1-NEXT: orrs r0, r4 +; THUMB1-NEXT: pop ; ; THUMB2-LABEL: test5: -; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldr r1, [r1] -; THUMB2-NEXT: ldr r0, [r0] +; THUMB2: ldr r1, [r1] +; THUMB2-NEXT: ldrh r0, [r0] ; THUMB2-NEXT: muls r1, r2, r1 ; THUMB2-NEXT: eors r0, r3 +; THUMB2-NEXT: uxth r1, r1 ; THUMB2-NEXT: orrs r0, r1 -; THUMB2-NEXT: uxth r0, r0 ; THUMB2-NEXT: bx lr entry: %0 = load i32, i32* %a, align 4 @@ -964,3 +846,4 @@ entry: %and = and i32 %or, 65535 ret i32 %and } + diff --git a/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll b/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll new file mode 100644 index 0000000000000..0f8f18a178795 --- /dev/null +++ b/test/CodeGen/PowerPC/combine_loads_from_build_pair.ll @@ -0,0 +1,21 @@ +; RUN: llc -verify-machineinstrs -O0 -mcpu=g4 -mtriple=powerpc-apple-darwin8 < %s -debug -stop-after=machineverifier 2>&1 | FileCheck %s + +; REQUIRES: asserts + +define i64 @func1(i64 %p1, i64 %p2, i64 %p3, i64 %p4, { i64, i8* } %struct) { +; Verify that we get a combine on the build_pair, creating a LD8 load somewhere +; between "Initial selection DAG" 
and "Optimized lowered selection DAG". +; The target is big-endian, and stack grows towards higher addresses, +; so we expect the LD8 to load from the address used in the original HIBITS +; load. +; CHECK-LABEL: Initial selection DAG: +; CHECK-DAG: [[LOBITS:t[0-9]+]]: i32,ch = load +; CHECK-DAG: [[HIBITS:t[0-9]+]]: i32,ch = load +; CHECK: Combining: t{{[0-9]+}}: i64 = build_pair [[LOBITS]], [[HIBITS]] +; CHECK-NEXT: into +; CHECK-SAME: loadconvertToThreeAddress() +; which contains a %noreg operand. + +define i32 @f23(i32 %old) { + %and1 = and i32 %old, 14 + %and2 = and i32 %old, 254 + %res1 = call i32 asm "stepa $1, $2, $3", "=h,r,r,0"(i32 %old, i32 %and1, i32 %and2) + %and3 = and i32 %res1, 127 + %and4 = and i32 %res1, 128 + %res2 = call i32 asm "stepb $1, $2, $3", "=r,h,h,0"(i32 %res1, i32 %and3, i32 %and4) + ret i32 %res2 +} diff --git a/test/CodeGen/SystemZ/vec-trunc-to-i1.ll b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll index 73d4c47a84094..2901cf0f29a8d 100644 --- a/test/CodeGen/SystemZ/vec-trunc-to-i1.ll +++ b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll @@ -1,26 +1,23 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; ; Check that a widening truncate to a vector of i1 elements can be handled. - +; NOTE: REG2 is actually not needed (tempororary FAIL) define void @pr32275(<4 x i8> %B15) { ; CHECK-LABEL: pr32275: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: vrepif %v0, 1 -; CHECK-NEXT: .LBB0_1: # %CF34 -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vlgvb %r0, %v24, 3 +; CHECK-NEXT: vrepif [[REG0:%v[0-9]]], 1 +; CHECK: vlgvb %r0, %v24, 3 ; CHECK-NEXT: vlgvb %r1, %v24, 1 -; CHECK-NEXT: vlvgp %v1, %r1, %r0 +; CHECK-NEXT: vlvgp [[REG1:%v[0-9]]], %r1, %r0 ; CHECK-NEXT: vlgvb %r0, %v24, 0 -; CHECK-NEXT: vlvgf %v1, %r0, 0 -; CHECK-NEXT: vlgvb %r0, %v24, 2 -; CHECK-NEXT: vlvgf %v1, %r0, 2 -; CHECK-NEXT: vn %v1, %v1, %v0 -; CHECK-NEXT: vlgvf %r0, %v1, 3 -; CHECK-NEXT: tmll %r0, 1 +; CHECK-DAG: vlr [[REG2:%v[0-9]]], [[REG1]] +; CHECK-DAG: vlvgf [[REG2]], %r0, 0 +; CHECK-DAG: vlgvb [[REG3:%r[0-9]]], %v24, 2 +; CHECK-NEXT: vlvgf [[REG2]], [[REG3]], 2 +; CHECK-NEXT: vn [[REG2]], [[REG2]], [[REG0]] +; CHECK-NEXT: vlgvf [[REG4:%r[0-9]]], [[REG2]], 3 +; CHECK-NEXT: tmll [[REG4]], 1 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %CF36 ; CHECK-NEXT: br %r14 diff --git a/test/CodeGen/WebAssembly/globl.ll b/test/CodeGen/WebAssembly/globl.ll index ba9f6659d7d73..c3126d5586369 100644 --- a/test/CodeGen/WebAssembly/globl.ll +++ b/test/CodeGen/WebAssembly/globl.ll @@ -4,11 +4,14 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown-wasm" ; CHECK: .globl foo +; CHECK: .type foo,@function ; CHECK-LABEL: foo: -define void @foo() { - ret void +; CHECK: .size foo, +define i32* @foo() { + ret i32* @bar } -; Check import directives - must be at the end of the file -; CHECK: .import_global bar{{$}} -@bar = external global i32 +; CHECK: .type bar,@object +; CHECK: .globl bar +; CHECK: .size bar, 4 +@bar = global i32 2 diff --git a/test/CodeGen/WebAssembly/stack-alignment.ll b/test/CodeGen/WebAssembly/stack-alignment.ll index 25e9d06db411b..6128c8a4d2358 100644 --- a/test/CodeGen/WebAssembly/stack-alignment.ll +++ b/test/CodeGen/WebAssembly/stack-alignment.ll @@ -147,3 +147,5 @@ entry: call void @somefunc(i32* %static) ret void } + +; CHECK: .stack_pointer __stack_pointer diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll index 
4eacbdd4ccb53..578d7aa75287a 100644 --- a/test/CodeGen/X86/avg-mask.ll +++ b/test/CodeGen/X86/avg-mask.ll @@ -252,8 +252,8 @@ define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -280,8 +280,8 @@ define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 1c75035020440..bb6afdede9f14 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -236,7 +236,7 @@ define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) { ; GENERIC-LABEL: test_gatherdpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherdpd: @@ -271,7 +271,7 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 define <4 x double> @test_gatherdpd_ymm(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3) { ; GENERIC-LABEL: test_gatherdpd_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 +; GENERIC-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherdpd_ymm: @@ -306,7 +306,7 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32> define <4 x float> @test_gatherdps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3) { ; GENERIC-LABEL: test_gatherdps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherdps: @@ -341,7 +341,7 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x define <8 x float> @test_gatherdps_ymm(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3) { ; GENERIC-LABEL: test_gatherdps_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 +; GENERIC-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherdps_ymm: @@ -376,7 +376,7 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, define <2 x double> @test_gatherqpd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3) { ; GENERIC-LABEL: test_gatherqpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq 
# sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherqpd: @@ -411,7 +411,7 @@ declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 define <4 x double> @test_gatherqpd_ymm(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3) { ; GENERIC-LABEL: test_gatherqpd_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 +; GENERIC-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherqpd_ymm: @@ -446,7 +446,7 @@ declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64> define <4 x float> @test_gatherqps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3) { ; GENERIC-LABEL: test_gatherqps: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_gatherqps: @@ -481,7 +481,7 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x define <4 x float> @test_gatherqps_ymm(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3) { ; GENERIC-LABEL: test_gatherqps_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 +; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2750,7 +2750,7 @@ define <4 x i64> @test_permq(<4 x i64> %a0, <4 x i64> *%a1) { define <4 x i32> @test_pgatherdd(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3) { ; GENERIC-LABEL: test_pgatherdd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherdd: @@ -2760,7 +2760,7 @@ define <4 x i32> @test_pgatherdd(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32 ; ; BROADWELL-LABEL: test_pgatherdd: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 +; BROADWELL-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherdd: @@ -2785,7 +2785,7 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32> define <8 x i32> @test_pgatherdd_ymm(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3) { ; GENERIC-LABEL: test_pgatherdd_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 +; GENERIC-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherdd_ymm: @@ -2795,7 +2795,7 @@ define <8 x i32> @test_pgatherdd_ymm(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x ; ; BROADWELL-LABEL: test_pgatherdd_ymm: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 +; BROADWELL-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherdd_ymm: @@ -2820,7 +2820,7 @@ declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x define <2 x i64> @test_pgatherdq(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3) { ; GENERIC-LABEL: test_pgatherdq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherdq: @@ -2830,7 +2830,7 
@@ define <2 x i64> @test_pgatherdq(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64 ; ; BROADWELL-LABEL: test_pgatherdq: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 +; BROADWELL-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherdq: @@ -2855,7 +2855,7 @@ declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64> define <4 x i64> @test_pgatherdq_ymm(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3) { ; GENERIC-LABEL: test_pgatherdq_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 +; GENERIC-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherdq_ymm: @@ -2865,7 +2865,7 @@ define <4 x i64> @test_pgatherdq_ymm(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x ; ; BROADWELL-LABEL: test_pgatherdq_ymm: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 +; BROADWELL-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherdq_ymm: @@ -2890,7 +2890,7 @@ declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x define <4 x i32> @test_pgatherqd(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3) { ; GENERIC-LABEL: test_pgatherqd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherqd: @@ -2900,7 +2900,7 @@ define <4 x i32> @test_pgatherqd(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32 ; ; BROADWELL-LABEL: test_pgatherqd: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 +; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherqd: @@ -2925,7 +2925,7 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32> define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3) { ; GENERIC-LABEL: test_pgatherqd_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 +; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2937,7 +2937,7 @@ define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x ; ; BROADWELL-LABEL: test_pgatherqd_ymm: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 +; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50] ; BROADWELL-NEXT: vzeroupper # sched: [4:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; @@ -2966,7 +2966,7 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x define <2 x i64> @test_pgatherqq(<2 x i64> %a0, i8 *%a1, <2 x i64> %a2, <2 x i64> %a3) { ; GENERIC-LABEL: test_pgatherqq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 +; GENERIC-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherqq: @@ -2976,7 +2976,7 @@ define <2 x i64> @test_pgatherqq(<2 x i64> %a0, i8 *%a1, <2 x i64> %a2, <2 x i64 ; ; BROADWELL-LABEL: test_pgatherqq: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 +; 
BROADWELL-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherqq: @@ -3001,7 +3001,7 @@ declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64> define <4 x i64> @test_pgatherqq_ymm(<4 x i64> %a0, i8 *%a1, <4 x i64> %a2, <4 x i64> %a3) { ; GENERIC-LABEL: test_pgatherqq_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 +; GENERIC-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pgatherqq_ymm: @@ -3011,7 +3011,7 @@ define <4 x i64> @test_pgatherqq_ymm(<4 x i64> %a0, i8 *%a1, <4 x i64> %a2, <4 x ; ; BROADWELL-LABEL: test_pgatherqq_ymm: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 +; BROADWELL-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pgatherqq_ymm: diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index e62367d25604b..32b243d2ebad8 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -63,8 +63,9 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test3: @@ -85,8 +86,9 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b ret <8 x i1> %c @@ -271,8 +273,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: movb $85, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; @@ -306,8 +309,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: movb $85, %al ; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 0487b56072942..54342a10b95bb 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1468,11 +1468,9 @@ define <16 x double> @sbto16f64(<16 x double> %a) { ; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; 
NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1 +; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; NOVLDQ-NEXT: retq ; @@ -1519,8 +1517,7 @@ define <8 x double> @sbto8f64(<8 x double> %a) { ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVLDQ-NEXT: retq ; @@ -1559,8 +1556,7 @@ define <8 x float> @sbto8f32(<8 x float> %a) { ; NOVLDQ-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; NOVLDQ-NEXT: retq ; @@ -1862,14 +1858,12 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVL-NEXT: movq {{.*}}(%rip), %rax -; NOVL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} -; NOVL-NEXT: vpmovqd %zmm0, %ymm0 -; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: movl {{.*}}(%rip), %eax +; NOVL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVL-NEXT: kshiftrw $8, %k1, %k1 -; NOVL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} -; NOVL-NEXT: vpmovqd %zmm1, %ymm1 -; NOVL-NEXT: vcvtudq2pd %ymm1, %zmm1 +; NOVL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; NOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto16f64: @@ -1894,10 +1888,8 @@ define <8 x float> @ubto8f32(<8 x i32> %a) { ; NOVL-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; NOVL-NEXT: vpmovqd %zmm0, %ymm0 -; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; NOVL-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NOVL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto8f32: @@ -1918,9 +1910,8 @@ define <8 x double> @ubto8f64(<8 x i32> %a) { ; NOVL-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; NOVL-NEXT: vpmovqd %zmm0, %ymm0 -; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto8f64: diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 80127f66bdfe6..50de773af0011 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5,6 +5,59 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c +define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 { +; X32-LABEL: test_mm512_kunpackb: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: 
.cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 +; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 +; X32-NEXT: kunpckbw %k0, %k1, %k1 +; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1} +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kunpackb: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; X64-NEXT: kunpckbw %k0, %k1, %k1 +; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1} +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__A to <16 x i32> + %1 = bitcast <8 x i64> %__B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %4 = bitcast <8 x i64> %__C to <16 x i32> + %5 = bitcast <8 x i64> %__D to <16 x i32> + %6 = icmp ne <16 x i32> %4, %5 + %7 = bitcast <16 x i1> %6 to i16 + %8 = and i16 %7, 255 + %shl.i = shl i16 %3, 8 + %or.i = or i16 %8, %shl.i + %9 = bitcast <8 x i64> %__E to <16 x i32> + %10 = bitcast <8 x i64> %__F to <16 x i32> + %11 = icmp ne <16 x i32> %9, %10 + %12 = bitcast i16 %or.i to <16 x i1> + %13 = and <16 x i1> %11, %12 + %14 = bitcast <16 x i1> %13 to i16 + ret i16 %14 +} + define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a90652735b55f..95fe89672c609 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,7 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: shll $8, %esi +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: ## kill: %ax %ax %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastd %edi, %zmm1 diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 6829b6f378ca3..bf0f6e788170c 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -96,21 +96,6 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ret i16 %t2 } -declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone - -define i16 @unpckbw_test(i16 %a0, i16 %a1) { -; CHECK-LABEL: unpckbw_test: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: kunpckbw %k1, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: ## kill: %ax %ax %eax -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) - ret i16 %res -} - declare i16 
@llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone ; TODO: the two kxnor instructions here are no-ops and should be eliminated, ; probably by FoldConstantArithmetic in SelectionDAG. diff --git a/test/CodeGen/X86/avx512-mask-op.ll index ab634d7d8d5e3..6feeb74e67ca1 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1175,8 +1175,9 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: kshiftlw $7, %k0, %k0 ; KNL-NEXT: korw %k0, %k1, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1241,8 +1242,9 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1 ; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 ; AVX512DQ-NEXT: korb %k0, %k1, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %xmm0 %xmm0 %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -2572,9 +2574,8 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: ## kill: %xmm0 %xmm0 %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: ## kill: %xmm0 %xmm0 %zmm0 ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -2588,9 +2589,8 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: ## kill: %xmm0 %xmm0 %ymm0 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: ## kill: %xmm0 %xmm0 %zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; diff --git a/test/CodeGen/X86/avx512-schedule.ll index 3ef36e7e5be7e..01931f876c97e 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -401,7 +401,7 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma ; GENERIC-LABEL: vpaddd_mask_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -421,7 +421,7 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m ; GENERIC-LABEL: vpaddd_maskz_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -441,7 +441,7 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ; GENERIC-LABEL: vpaddd_mask_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1,
%k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -462,7 +462,7 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) ; GENERIC-LABEL: vpaddd_mask_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -482,7 +482,7 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ; GENERIC-LABEL: vpaddd_maskz_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -503,7 +503,7 @@ define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) ; GENERIC-LABEL: vpaddd_maskz_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -718,7 +718,7 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ; GENERIC-LABEL: test_mask_vaddps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -740,7 +740,7 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x ; GENERIC-LABEL: test_mask_vmulps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -760,7 +760,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x ; GENERIC-LABEL: test_mask_vminps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -781,7 +781,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x d ; GENERIC-LABEL: test_mask_vminpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -802,7 +802,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x ; GENERIC-LABEL: test_mask_vmaxps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmaxps 
%zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -823,7 +823,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x d ; GENERIC-LABEL: test_mask_vmaxpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -844,7 +844,7 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x ; GENERIC-LABEL: test_mask_vsubps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -864,7 +864,7 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x ; GENERIC-LABEL: test_mask_vdivps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -884,7 +884,7 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x d ; GENERIC-LABEL: test_mask_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -904,7 +904,7 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i6 ; GENERIC-LABEL: test_maskz_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -924,7 +924,7 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, < ; GENERIC-LABEL: test_mask_fold_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -945,7 +945,7 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, < ; GENERIC-LABEL: test_maskz_fold_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -984,7 +984,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> ; GENERIC-LABEL: test_mask_broadcast_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: 
vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1010,7 +1010,7 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, ; GENERIC-LABEL: test_maskz_broadcast_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1163,14 +1163,14 @@ l2: define i32 @test3(float %a, float %b) { ; GENERIC-LABEL: test3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kmovd %k0, %eax ; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test3: ; SKX: # %bb.0: -; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] @@ -1579,7 +1579,7 @@ define <16 x i8> @f32to16uc(<16 x float> %f) { ; GENERIC-LABEL: f32to16uc: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovdb %zmm0, %xmm0 +; GENERIC-NEXT: vpmovdb %zmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1597,7 +1597,7 @@ define <16 x i16> @f32to16us(<16 x float> %f) { ; GENERIC-LABEL: f32to16us: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovdw %zmm0, %ymm0 +; GENERIC-NEXT: vpmovdw %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to16us: @@ -1655,7 +1655,7 @@ define <8 x i16> @f64to8us(<8 x double> %f) { ; GENERIC-LABEL: f64to8us: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 +; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1673,7 +1673,7 @@ define <8 x i8> @f64to8uc(<8 x double> %f) { ; GENERIC-LABEL: f64to8uc: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 +; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2317,12 +2317,12 @@ define i32 @fptosi(float %a) nounwind { define i32 @fptoui(float %a) nounwind { ; GENERIC-LABEL: fptoui: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vcvttss2usi %xmm0, %eax +; GENERIC-NEXT: vcvttss2usi %xmm0, %eax # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fptoui: ; SKX: # %bb.0: -; SKX-NEXT: vcvttss2usi %xmm0, %eax +; SKX-NEXT: vcvttss2usi %xmm0, %eax # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui float %a to i32 ret i32 %b @@ -2331,7 +2331,7 @@ define i32 @fptoui(float %a) nounwind { define float @uitof32(i32 %a) nounwind { ; GENERIC-LABEL: uitof32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 +; GENERIC-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uitof32: @@ -2345,7 +2345,7 @@ define float @uitof32(i32 %a) nounwind { define double @uitof64(i32 %a) nounwind { ; GENERIC-LABEL: uitof64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 +; GENERIC-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uitof64: @@ -2360,7 +2360,7 @@ define <16 x float> @sbto16f32(<16 x i32> %a) { ; GENERIC-LABEL: sbto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2380,7 +2380,7 @@ define <16 x float> @sbto16f32(<16 x i32> %a) { define <16 x float> @scto16f32(<16 x i8> %a) { ; GENERIC-LABEL: scto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2396,7 +2396,7 @@ define <16 x float> @scto16f32(<16 x i8> %a) { define <16 x float> @ssto16f32(<16 x i16> %a) { ; GENERIC-LABEL: ssto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2448,7 +2448,7 @@ define <8 x double> @scto8f64(<8 x i8> %a) { define <16 x double> @scto16f64(<16 x i8> %a) { ; GENERIC-LABEL: scto16f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm1 +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] @@ -2621,7 +2621,7 @@ define <2 x double> @sbto2f64(<2 x double> %a) { define <16 x float> @ucto16f32(<16 x i8> %a) { ; GENERIC-LABEL: ucto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2655,7 +2655,7 @@ define <8 x double> @ucto8f64(<8 x i8> %a) { define <16 x float> @swto16f32(<16 x i16> %a) { ; GENERIC-LABEL: swto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2687,7 +2687,7 @@ define <8 x double> @swto8f64(<8 x i16> %a) { define <16 x double> @swto16f64(<16 x i16> %a) { ; GENERIC-LABEL: swto16f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm1 +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] 
@@ -2707,7 +2707,7 @@ define <16 x double> @swto16f64(<16 x i16> %a) { define <16 x double> @ucto16f64(<16 x i8> %a) { ; GENERIC-LABEL: ucto16f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] @@ -2727,7 +2727,7 @@ define <16 x double> @ucto16f64(<16 x i8> %a) { define <16 x float> @uwto16f32(<16 x i16> %a) { ; GENERIC-LABEL: uwto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2759,7 +2759,7 @@ define <8 x double> @uwto8f64(<8 x i16> %a) { define <16 x double> @uwto16f64(<16 x i16> %a) { ; GENERIC-LABEL: uwto16f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] @@ -2813,7 +2813,7 @@ define <16 x double> @sito16f64(<16 x i32> %a) { define <16 x float> @usto16f32(<16 x i16> %a) { ; GENERIC-LABEL: usto16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: 
[1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2830,7 +2830,7 @@ define <16 x float> @ubto16f32(<16 x i32> %a) { ; GENERIC-LABEL: ubto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2851,7 +2851,7 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; GENERIC-LABEL: ubto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] ; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] @@ -2880,7 +2880,7 @@ define <8 x float> @ubto8f32(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2901,7 +2901,7 @@ define <8 x double> @ubto8f64(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2922,7 +2922,7 @@ define <4 x float> @ubto4f32(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2943,7 +2943,7 @@ define <4 x double> @ubto4f64(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2965,7 +2965,7 @@ define <2 x float> @ubto2f32(<2 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2988,7 +2988,7 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] -; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpltuq %xmm1, 
%xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3011,7 +3011,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x16:
@@ -3031,7 +3031,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x16:
@@ -3052,7 +3052,7 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8mem_to_16x16:
@@ -3072,7 +3072,7 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8mem_to_16x16:
@@ -3106,7 +3106,7 @@ define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8_to_16x16_mask:
@@ -3139,7 +3139,7 @@ define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8_to_16x16_mask:
@@ -3158,7 +3158,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %ymm0, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_32x8mem_to_32x16:
@@ -3178,7 +3178,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %ymm0, %k1
-; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_32x8mem_to_32x16:
@@ -3196,7 +3196,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
 define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
 ; GENERIC-LABEL: zext_32x8_to_32x16:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_32x8_to_32x16:
@@ -3212,7 +3212,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %ymm1, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_32x8_to_32x16_mask:
@@ -3229,7 +3229,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
 define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
 ; GENERIC-LABEL: sext_32x8_to_32x16:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0
+; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_32x8_to_32x16:
@@ -3245,7 +3245,7 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %ymm1, %k1
-; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_32x8_to_32x16_mask:
@@ -3264,7 +3264,7 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x8mem_to_4x32:
@@ -3284,7 +3284,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x8mem_to_4x32:
@@ -3304,7 +3304,7 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x32:
@@ -3324,7 +3324,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x32:
@@ -3344,7 +3344,7 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8mem_to_16x32:
@@ -3364,7 +3364,7 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8mem_to_16x32:
@@ -3384,7 +3384,7 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8_to_16x32_mask:
@@ -3403,7 +3403,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8_to_16x32_mask:
@@ -3420,7 +3420,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
 define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
 ; GENERIC-LABEL: zext_16x8_to_16x32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x8_to_16x32:
@@ -3434,7 +3434,7 @@ define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
 define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
 ; GENERIC-LABEL: sext_16x8_to_16x32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x8_to_16x32:
@@ -3450,7 +3450,7 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x8mem_to_2x64:
@@ -3469,7 +3469,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x8mem_to_2x64mask:
@@ -3503,7 +3503,7 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x8mem_to_4x64:
@@ -3523,7 +3523,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x8mem_to_4x64mask:
@@ -3558,7 +3558,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x8mem_to_8x64:
@@ -3578,7 +3578,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x64mask:
@@ -3596,7 +3596,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin
 define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x8mem_to_8x64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x8mem_to_8x64:
@@ -3613,7 +3613,7 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x16mem_to_4x32:
@@ -3633,7 +3633,7 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x16mem_to_4x32mask:
@@ -3669,7 +3669,7 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16mem_to_8x32:
@@ -3689,7 +3689,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x32mask:
@@ -3724,7 +3724,7 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16_to_8x32mask:
@@ -3757,7 +3757,7 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x16mem_to_16x32:
@@ -3777,7 +3777,7 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask)
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x16mem_to_16x32mask:
@@ -3795,7 +3795,7 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask)
 define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_16x16mem_to_16x32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0
+; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_16x16mem_to_16x32:
@@ -3811,7 +3811,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x16_to_16x32mask:
@@ -3828,7 +3828,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
 define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
 ; GENERIC-LABEL: zext_16x16_to_16x32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_16x16_to_16x32:
@@ -3844,7 +3844,7 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x16mem_to_2x64:
@@ -3864,7 +3864,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x16mem_to_2x64mask:
@@ -3899,7 +3899,7 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x16mem_to_4x64:
@@ -3919,7 +3919,7 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x16mem_to_4x64mask:
@@ -3954,7 +3954,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16mem_to_8x64:
@@ -3974,7 +3974,7 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x64mask:
@@ -3992,7 +3992,7 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x16mem_to_8x64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0
+; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x16mem_to_8x64:
@@ -4009,7 +4009,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16_to_8x64mask:
@@ -4026,7 +4026,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
 ; GENERIC-LABEL: zext_8x16_to_8x64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x16_to_8x64:
@@ -4042,7 +4042,7 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_2x32mem_to_2x64:
@@ -4062,7 +4062,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_2x32mem_to_2x64mask:
@@ -4097,7 +4097,7 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x32mem_to_4x64:
@@ -4117,7 +4117,7 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_4x32mem_to_4x64mask:
@@ -4166,7 +4166,7 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_4x32_to_4x64mask:
@@ -4185,7 +4185,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x32mem_to_8x64:
@@ -4205,7 +4205,7 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm0, %k1
-; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z}
+; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x32mem_to_8x64mask:
@@ -4223,7 +4223,7 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw
 define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
 ; GENERIC-LABEL: sext_8x32mem_to_8x64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0
+; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x32mem_to_8x64:
@@ -4238,7 +4238,7 @@ define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
 define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
 ; GENERIC-LABEL: sext_8x32_to_8x64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxdq %ymm0, %zmm0
+; GENERIC-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: sext_8x32_to_8x64:
@@ -4254,7 +4254,7 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovw2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: zext_8x32_to_8x64mask:
@@ -4423,7 +4423,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; GENERIC-LABEL: sext_8i1_8i32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2d %k0, %ymm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4474,7 +4474,7 @@ define i16 @trunc_i32_to_i1(i32 %a) {
 define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; GENERIC-LABEL: sext_8i1_8i16:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2w %k0, %xmm0
 ; GENERIC-NEXT: vzeroupper
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4493,7 +4493,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
 ; GENERIC-LABEL: sext_16i1_16i32:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2d %k0, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4510,7 +4510,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
 define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; GENERIC-LABEL: sext_8i1_8i64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2q %k0, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4527,7 +4527,7 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
 ; GENERIC-LABEL: extload_v8i64:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi)
 ; GENERIC-NEXT: vzeroupper
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4586,7 +4586,7 @@ define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %xmm1, %k1
-; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: shuffle_zext_16x8_to_16x16_mask:
@@ -4694,7 +4694,7 @@ define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
 define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
 ; GENERIC-LABEL: zext_64xi1_to_64xi8:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4711,7 +4711,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
 define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
 ; GENERIC-LABEL: zext_32xi1_to_32xi16:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4728,7 +4728,7 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
 define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
 ; GENERIC-LABEL: zext_16xi1_to_16xi16:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4746,7 +4746,7 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
 define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 ; GENERIC-LABEL: zext_32xi1_to_32xi8:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4766,7 +4766,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
 ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4789,7 +4789,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
 ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5081,7 +5081,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpandd:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandd:
@@ -5101,7 +5101,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; GENERIC-LABEL: vpandnd:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandnd:
@@ -5123,7 +5123,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; GENERIC-LABEL: vpord:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpord:
@@ -5143,7 +5143,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpxord:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpxord:
@@ -5163,7 +5163,7 @@ define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone s
 ; GENERIC-LABEL: vpandq:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandq:
@@ -5182,7 +5182,7 @@ define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone
 ; GENERIC-LABEL: vpandnq:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandnq:
@@ -5202,7 +5202,7 @@ define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ss
 ; GENERIC-LABEL: vporq:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vporq:
@@ -5221,7 +5221,7 @@ define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone s
 ; GENERIC-LABEL: vpxorq:
 ; GENERIC: # %bb.0: # %entry
 ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
-; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpxorq:
@@ -6402,7 +6402,7 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test32:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6423,7 +6423,7 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test33:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6444,7 +6444,7 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test34:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6465,7 +6465,7 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
 ; GENERIC-LABEL: mov_test35:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6486,7 +6486,7 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test36:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6507,7 +6507,7 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test37:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6528,7 +6528,7 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test38:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6549,7 +6549,7 @@ define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
 ; GENERIC-LABEL: mov_test39:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6947,7 +6947,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
 define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test1:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT: kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT: kmovd %k0, %eax
@@ -6973,7 +6973,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test2:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT: kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT: kmovd %k0, %eax
@@ -7001,7 +7001,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 ; GENERIC-LABEL: zext_test3:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: kshiftlw $10, %k0, %k0
 ; GENERIC-NEXT: kshiftrw $15, %k0, %k0
 ; GENERIC-NEXT: kmovd %k0, %eax
@@ -7055,8 +7055,8 @@ entry:
 define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
 ; GENERIC-LABEL: test4:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: kandnw %k0, %k1, %k0
 ; GENERIC-NEXT: vpmovm2d %k0, %xmm0
 ; GENERIC-NEXT: vzeroupper
@@ -7080,8 +7080,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
 define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) {
 ; GENERIC-LABEL: vcmp_test5:
 ; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0
-; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: kandnw %k1, %k0, %k0
 ; GENERIC-NEXT: vpmovm2q %k0, %xmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -7150,12 +7150,12 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
 ; GENERIC-NEXT: jg .LBB386_1 # sched: [1:1.00]
 ; GENERIC-NEXT: # %bb.2:
-; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0
+; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2b %k0, %xmm0
 ; GENERIC-NEXT: vzeroupper
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ; GENERIC-NEXT: .LBB386_1:
-; GENERIC-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; GENERIC-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: vpmovm2b %k0, %xmm0
 ; GENERIC-NEXT: vzeroupper
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -7330,7 +7330,7 @@ define <64 x i8> @vmov_test16(i64 %x) {
 ; GENERIC-NEXT: vpmovm2b %k0, %zmm1
 ; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33]
 ; GENERIC-NEXT: kmovd %eax, %k1
-; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %zmm0, %k0
 ; GENERIC-NEXT: vpmovm2b %k0, %zmm0
@@ -7370,7 +7370,7 @@ define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) {
 ; GENERIC-NEXT: vpmovm2b %k0, %zmm1
 ; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33]
 ; GENERIC-NEXT: kmovd %eax, %k1
-; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vpmovb2m %zmm0, %k0
 ; GENERIC-NEXT: vpmovm2b %k0, %zmm0
@@ -8079,7 +8079,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; GENERIC-LABEL: test_bitcast_v8i1_zext:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: kmovb %k0, %eax
 ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
 ; GENERIC-NEXT: vzeroupper
@@ -8105,7 +8105,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 ; GENERIC-LABEL: test_bitcast_v16i1_zext:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
 ; GENERIC-NEXT: kmovw %k0, %eax
 ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
 ; GENERIC-NEXT: vzeroupper
@@ -8339,7 +8339,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
 ; GENERIC-LABEL: _ss16xfloat_mask:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -8362,7 +8362,7 @@ define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
 ; GENERIC-LABEL: _ss16xfloat_maskz:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -8399,7 +8399,7 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
 ; GENERIC-LABEL: _ss16xfloat_mask_load:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -8421,7 +8421,7 @@ define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1)
 ; GENERIC-LABEL: _ss16xfloat_maskz_load:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -8458,7 +8458,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
 ; GENERIC-LABEL: _sd8xdouble_mask:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -8481,7 +8481,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
 ; GENERIC-LABEL: _sd8xdouble_maskz:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -8518,7 +8518,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
 ; GENERIC-LABEL: _sd8xdouble_mask_load:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -8540,7 +8540,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1)
 ; GENERIC-LABEL: _sd8xdouble_maskz_load:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
index db7b9cb25029e..6c351d2103ded 100755
--- a/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -24,7 +24,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -48,7 +48,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -69,7 +69,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -93,7 +93,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -114,7 +114,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -138,7 +138,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -174,7 +174,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -198,7 +198,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -235,7 +235,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -258,7 +258,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -281,7 +281,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -304,7 +304,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -327,7 +327,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -350,7 +350,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -389,7 +389,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -412,7 +412,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -450,7 +450,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -474,7 +474,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -495,7 +495,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -519,7 +519,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -540,7 +540,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -564,7 +564,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -600,7 +600,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -624,7 +624,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -661,7 +661,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -684,7 +684,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -707,7 +707,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -730,7 +730,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -753,7 +753,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -776,7 +776,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -815,7 +815,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16>
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -838,7 +838,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -876,7 +876,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -900,7 +900,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32>
%vec, <8 x i32> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -921,7 +921,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -945,7 +945,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -966,7 +966,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -990,7 +990,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1026,7 +1026,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1050,7 +1050,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1087,7 +1087,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = 
[3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1110,7 +1110,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1133,7 +1133,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1156,7 +1156,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1179,7 +1179,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1202,7 +1202,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1241,7 +1241,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1264,7 +1264,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %ymm1, 
%ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1302,7 +1302,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1326,7 +1326,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1347,7 +1347,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1371,7 +1371,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1392,7 +1392,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1416,7 +1416,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1452,7 +1452,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: 
[3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1476,7 +1476,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1513,7 +1513,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1536,7 +1536,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1559,7 +1559,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1582,7 +1582,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1605,7 +1605,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1628,7 +1628,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), 
%zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1667,7 +1667,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1690,7 +1690,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1725,7 +1725,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC-LABEL: test_masked_4xi64_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1747,7 +1747,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1766,7 +1766,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC-LABEL: test_masked_4xi64_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1788,7 +1788,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1807,7 +1807,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC-LABEL: test_masked_4xi64_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1829,7 +1829,7 @@ define <4 x i64> 
@test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1861,7 +1861,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC-LABEL: test_masked_4xi64_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1883,7 +1883,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask ; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1916,7 +1916,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1937,7 +1937,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> % ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1958,7 +1958,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1979,7 +1979,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> % ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2000,7 +2000,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; 
GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2021,7 +2021,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> % ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2056,7 +2056,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2077,7 +2077,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> % ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2114,7 +2114,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2138,7 +2138,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2158,7 +2158,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2180,7 +2180,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2200,7 +2200,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; 
GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2224,7 +2224,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2257,7 +2257,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2279,7 +2279,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2299,7 +2299,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2323,7 +2323,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2343,7 +2343,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2365,7 +2365,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2400,7 +2400,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2424,7 +2424,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2444,7 +2444,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2466,7 +2466,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> % ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2502,7 +2502,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2525,7 +2525,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2547,7 +2547,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} 
= mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2568,7 +2568,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i6 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2590,7 +2590,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2613,7 +2613,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2649,7 +2649,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2670,7 +2670,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i6 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2692,7 +2692,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2715,7 +2715,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2737,7 +2737,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5: 
; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2758,7 +2758,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i6 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2796,7 +2796,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2819,7 +2819,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2841,7 +2841,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2862,7 +2862,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i6 ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2899,7 +2899,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2923,7 +2923,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: 
[3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2944,7 +2944,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2968,7 +2968,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2989,7 +2989,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3013,7 +3013,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3049,7 +3049,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3073,7 +3073,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3110,7 +3110,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: 
[5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3133,7 +3133,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3156,7 +3156,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3179,7 +3179,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3202,7 +3202,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3225,7 +3225,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3264,7 +3264,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3287,7 +3287,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3325,7 +3325,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = 
[15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3349,7 +3349,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3370,7 +3370,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3394,7 +3394,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3415,7 +3415,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3439,7 +3439,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3475,7 +3475,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3499,7 +3499,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x ; 
GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3536,7 +3536,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3559,7 +3559,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3582,7 +3582,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3605,7 +3605,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3628,7 +3628,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3651,7 +3651,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3690,7 +3690,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = 
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3713,7 +3713,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <1
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3748,7 +3748,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3770,7 +3770,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3789,7 +3789,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3811,7 +3811,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3830,7 +3830,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3852,7 +3852,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3884,7 +3884,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3906,7 +3906,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3939,7 +3939,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3960,7 +3960,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -3981,7 +3981,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4002,7 +4002,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4023,7 +4023,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4044,7 +4044,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4079,7 +4079,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x
 ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4100,7 +4100,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4
 ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4137,7 +4137,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4161,7 +4161,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4181,7 +4181,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4203,7 +4203,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4223,7 +4223,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4247,7 +4247,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4280,7 +4280,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4302,7 +4302,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4322,7 +4322,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4346,7 +4346,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4366,7 +4366,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4388,7 +4388,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4423,7 +4423,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4447,7 +4447,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4467,7 +4467,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4489,7 +4489,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4525,7 +4525,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4548,7 +4548,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4570,7 +4570,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp,
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4591,7 +4591,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4613,7 +4613,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4636,7 +4636,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4672,7 +4672,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp,
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4693,7 +4693,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4715,7 +4715,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4738,7 +4738,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4760,7 +4760,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp,
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4781,7 +4781,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4819,7 +4819,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4842,7 +4842,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4864,7 +4864,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp,
 ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4885,7 +4885,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp
 ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4919,7 +4919,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4941,7 +4941,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -4960,7 +4960,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -4982,7 +4982,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5001,7 +5001,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5023,7 +5023,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5055,7 +5055,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5077,7 +5077,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5113,7 +5113,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5136,7 +5136,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5159,7 +5159,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5182,7 +5182,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5205,7 +5205,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5228,7 +5228,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5267,7 +5267,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5290,7 +5290,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5325,7 +5325,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5347,7 +5347,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5366,7 +5366,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5388,7 +5388,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5407,7 +5407,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5429,7 +5429,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5461,7 +5461,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5483,7 +5483,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5519,7 +5519,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5542,7 +5542,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5565,7 +5565,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5588,7 +5588,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5611,7 +5611,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5634,7 +5634,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5673,7 +5673,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5696,7 +5696,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5731,7 +5731,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5753,7 +5753,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5772,7 +5772,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5794,7 +5794,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5813,7 +5813,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5835,7 +5835,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5867,7 +5867,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2,
 ; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -5889,7 +5889,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask
 ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5925,7 +5925,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5948,7 +5948,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5971,7 +5971,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -5994,7 +5994,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6017,7 +6017,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6040,7 +6040,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6079,7 +6079,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6102,7 +6102,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6137,7 +6137,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6159,7 +6159,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16>
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6178,7 +6178,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6200,7 +6200,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6219,7 +6219,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6241,7 +6241,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16>
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6273,7 +6273,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6295,7 +6295,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6314,7 +6314,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6336,7 +6336,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16>
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6355,7 +6355,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6377,7 +6377,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6409,7 +6409,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6431,7 +6431,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16>
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6450,7 +6450,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -6472,7 +6472,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6505,7 +6505,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16
 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6526,7 +6526,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i
 ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -6547,7 +6547,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16>
 ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6568,7 +6568,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6589,7 +6589,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6610,7 +6610,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6645,7 +6645,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6666,7 +6666,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6687,7 +6687,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6708,7 +6708,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6729,7 
+6729,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6750,7 +6750,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6785,7 +6785,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6806,7 +6806,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6827,7 +6827,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6848,7 +6848,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6882,7 +6882,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -6904,7 +6904,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6923,7 +6923,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -6945,7 +6945,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6964,7 +6964,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -6986,7 +6986,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7018,7 +7018,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7040,7 +7040,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7059,7 +7059,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 ; GENERIC-LABEL: 
test_masked_16xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7081,7 +7081,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7100,7 +7100,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7122,7 +7122,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7154,7 +7154,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 ; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7176,7 +7176,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7195,7 +7195,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> ; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7217,7 
+7217,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i1 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7250,7 +7250,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7271,7 +7271,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7292,7 +7292,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7313,7 +7313,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7334,7 +7334,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7355,7 +7355,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7390,7 +7390,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, 
<16 x ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7411,7 +7411,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7432,7 +7432,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7453,7 +7453,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7474,7 +7474,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7495,7 +7495,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7530,7 +7530,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7551,7 +7551,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7572,7 +7572,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7593,7 +7593,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7627,7 +7627,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7649,7 +7649,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7668,7 +7668,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7690,7 +7690,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7709,7 
+7709,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7731,7 +7731,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7763,7 +7763,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7785,7 +7785,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7804,7 +7804,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7826,7 +7826,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7845,7 +7845,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, 
%xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7867,7 +7867,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7899,7 +7899,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 ; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7921,7 +7921,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7940,7 +7940,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> ; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7962,7 +7962,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i1 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7995,7 +7995,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = 
mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8016,7 +8016,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8037,7 +8037,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8058,7 +8058,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8079,7 +8079,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8100,7 +8100,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8135,7 +8135,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8156,7 +8156,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 ; GENERIC-LABEL: 
test_masked_z_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8177,7 +8177,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8198,7 +8198,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8220,7 +8220,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8243,7 +8243,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8279,7 +8279,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8300,7 +8300,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; 
@@ -8321,7 +8321,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8342,7 +8342,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8376,7 +8376,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8398,7 +8398,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8417,7 +8417,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8439,7 +8439,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8458,7 +8458,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8480,7 +8480,7 @@ define <4 x 
i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8512,7 +8512,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8534,7 +8534,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8567,7 +8567,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8588,7 +8588,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8609,7 +8609,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8630,7 +8630,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8651,7 +8651,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # 
sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8672,7 +8672,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8707,7 +8707,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8728,7 +8728,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8762,7 +8762,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test2_masked_8xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8784,7 +8784,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mas ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8803,7 +8803,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test2_masked_8xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8825,7 +8825,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mas ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8844,7 +8844,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x 
i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test2_masked_8xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8866,7 +8866,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mas ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8898,7 +8898,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-LABEL: test2_masked_8xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8920,7 +8920,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mas ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8953,7 +8953,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8974,7 +8974,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8995,7 +8995,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9016,7 +9016,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd 
%ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9037,7 +9037,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %v
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9058,7 +9058,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32>
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9093,7 +9093,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v
 ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9114,7 +9114,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32>
 ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9148,7 +9148,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9170,7 +9170,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32>
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9189,7 +9189,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9211,7 +9211,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32>
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9230,7 +9230,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9252,7 +9252,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32>
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9284,7 +9284,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9306,7 +9306,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32>
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9339,7 +9339,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9360,7 +9360,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9381,7 +9381,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9402,7 +9402,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9423,7 +9423,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9444,7 +9444,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9479,7 +9479,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32
 ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9500,7 +9500,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i
 ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9534,7 +9534,7 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9556,7 +9556,7 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9575,7 +9575,7 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9597,7 +9597,7 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9616,7 +9616,7 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9638,7 +9638,7 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9670,7 +9670,7 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9692,7 +9692,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9725,7 +9725,7 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9748,7 +9748,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9769,7 +9769,7 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9792,7 +9792,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9813,7 +9813,7 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9836,7 +9836,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9871,7 +9871,7 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9894,7 +9894,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9928,7 +9928,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9950,7 +9950,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -9969,7 +9969,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -9991,7 +9991,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10010,7 +10010,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10032,7 +10032,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10064,7 +10064,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10086,7 +10086,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10119,7 +10119,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10142,7 +10142,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10163,7 +10163,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10186,7 +10186,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10207,7 +10207,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10230,7 +10230,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10265,7 +10265,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1
 ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10288,7 +10288,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10322,7 +10322,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10344,7 +10344,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10363,7 +10363,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10385,7 +10385,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10404,7 +10404,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10426,7 +10426,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10458,7 +10458,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10480,7 +10480,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10513,7 +10513,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10536,7 +10536,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10557,7 +10557,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10580,7 +10580,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10601,7 +10601,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10624,7 +10624,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10659,7 +10659,7 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10682,7 +10682,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10716,7 +10716,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10738,7 +10738,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10757,7 +10757,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10779,7 +10779,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10798,7 +10798,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10820,7 +10820,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10852,7 +10852,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10874,7 +10874,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10907,7 +10907,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10930,7 +10930,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10951,7 +10951,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -10974,7 +10974,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -10995,7 +10995,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11018,7 +11018,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11053,7 +11053,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8
 ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11076,7 +11076,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
 ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11110,7 +11110,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11132,7 +11132,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11151,7 +11151,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11173,7 +11173,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11192,7 +11192,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11214,7 +11214,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11246,7 +11246,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11268,7 +11268,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11301,7 +11301,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11324,7 +11324,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11345,7 +11345,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11368,7 +11368,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11389,7 +11389,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11412,7 +11412,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11447,7 +11447,7 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11470,7 +11470,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11504,7 +11504,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11526,7 +11526,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11545,7 +11545,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11567,7 +11567,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11586,7 +11586,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11608,7 +11608,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11640,7 +11640,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11662,7 +11662,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11695,7 +11695,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11718,7 +11718,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11739,7 +11739,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11762,7 +11762,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11783,7 +11783,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11806,7 +11806,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11841,7 +11841,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3
 ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11864,7 +11864,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
 ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11898,7 +11898,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11920,7 +11920,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11939,7 +11939,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -11961,7 +11961,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -11980,7 +11980,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12002,7 +12002,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12034,7 +12034,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12056,7 +12056,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12089,7 +12089,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12112,7 +12112,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12133,7 +12133,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12156,7 +12156,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12177,7 +12177,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12200,7 +12200,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12235,7 +12235,7 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12258,7 +12258,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12292,7 +12292,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12314,7 +12314,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64>
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12333,7 +12333,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12355,7 +12355,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64>
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12374,7 +12374,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12396,7 +12396,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64>
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12428,7 +12428,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -12450,7 +12450,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64>
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -12483,7 +12483,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>*
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12506,7 +12506,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12527,7 +12527,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>*
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12550,7 +12550,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12571,7 +12571,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>*
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12594,7 +12594,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12629,7 +12629,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>*
 ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12652,7 +12652,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i
 ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12686,7 +12686,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12708,7 +12708,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12727,7 +12727,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12749,7 +12749,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12768,7 +12768,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12790,7 +12790,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12822,7 +12822,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12844,7 +12844,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12877,7 +12877,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12900,7 +12900,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %v
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12921,7 +12921,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12944,7 +12944,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %v
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -12965,7 +12965,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -12988,7 +12988,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %v
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13023,7 +13023,7 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13046,7 +13046,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %v
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13080,7 +13080,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13102,7 +13102,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13121,7 +13121,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13143,7 +13143,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13162,7 +13162,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13184,7 +13184,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13216,7 +13216,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13238,7 +13238,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13271,7 +13271,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13294,7 +13294,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %v
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13315,7 +13315,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13338,7 +13338,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %v
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13359,7 +13359,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13382,7 +13382,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %v
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13417,7 +13417,7 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13440,7 +13440,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %v
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13474,7 +13474,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13496,7 +13496,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13515,7 +13515,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13537,7 +13537,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13556,7 +13556,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13578,7 +13578,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13610,7 +13610,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13632,7 +13632,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13665,7 +13665,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13688,7 +13688,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float>
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13709,7 +13709,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -13732,7 +13732,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float>
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -13753,7 +13753,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -13776,7 +13776,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13811,7 +13811,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec ; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -13834,7 +13834,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13868,7 +13868,7 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, < ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -13890,7 +13890,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %ve ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13909,7 +13909,7 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, < ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} 
= xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -13931,7 +13931,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %ve ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13964,7 +13964,7 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -13987,7 +13987,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14008,7 +14008,7 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14031,7 +14031,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14065,7 +14065,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, < ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14087,7 +14087,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %ve ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14106,7 +14106,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, < ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14128,7 +14128,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %ve ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14147,7 +14147,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, < ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14169,7 +14169,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %ve ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14201,7 +14201,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, < ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14223,7 +14223,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %ve ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14256,7 +14256,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: 
vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14279,7 +14279,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14300,7 +14300,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14323,7 +14323,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14344,7 +14344,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14367,7 +14367,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14402,7 +14402,7 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec ; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14425,7 +14425,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq 
%ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14459,7 +14459,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, < ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14481,7 +14481,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %ve ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14500,7 +14500,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, < ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14522,7 +14522,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %ve ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14541,7 +14541,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, < ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14563,7 +14563,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %ve ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14595,7 +14595,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, < ; 
GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: vmovapd %zmm2, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14617,7 +14617,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %ve ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14650,7 +14650,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14673,7 +14673,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14694,7 +14694,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14717,7 +14717,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14738,7 +14738,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = 
zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14761,7 +14761,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14796,7 +14796,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec ; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14819,7 +14819,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14853,7 +14853,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14875,7 +14875,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14894,7 +14894,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14916,7 +14916,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: 
[1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14935,7 +14935,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -14957,7 +14957,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14989,7 +14989,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -15011,7 +15011,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -15044,7 +15044,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -15067,7 +15067,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> % ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -15088,7 +15088,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor 
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15111,7 +15111,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15132,7 +15132,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15155,7 +15155,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15190,7 +15190,7 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1,
 ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15213,7 +15213,7 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %
 ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15247,7 +15247,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15269,7 +15269,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15288,7 +15288,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15310,7 +15310,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15329,7 +15329,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15351,7 +15351,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15383,7 +15383,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15405,7 +15405,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15438,7 +15438,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15461,7 +15461,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15482,7 +15482,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15505,7 +15505,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15526,7 +15526,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15549,7 +15549,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15584,7 +15584,7 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1,
 ; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15607,7 +15607,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %
 ; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15641,7 +15641,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1,
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15663,7 +15663,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %v
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15682,7 +15682,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1,
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15704,7 +15704,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %v
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15723,7 +15723,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1,
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15745,7 +15745,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %v
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15777,7 +15777,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1,
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: vmovaps %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15799,7 +15799,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %v
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15832,7 +15832,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15855,7 +15855,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15876,7 +15876,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15899,7 +15899,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15920,7 +15920,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -15943,7 +15943,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -15978,7 +15978,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve
 ; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: vmovaps %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16001,7 +16001,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float
 ; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16035,7 +16035,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1,
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16057,7 +16057,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %v
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16076,7 +16076,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1,
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16098,7 +16098,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %v
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16131,7 +16131,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16154,7 +16154,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16175,7 +16175,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
 ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16198,7 +16198,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double
 ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16232,7 +16232,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1,
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16254,7 +16254,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %v
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16273,7 +16273,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1,
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16295,7 +16295,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %v
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16314,7 +16314,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1,
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16336,7 +16336,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %v
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16368,7 +16368,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1,
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16390,7 +16390,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %v
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16423,7 +16423,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %ve
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16446,7 +16446,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16467,7 +16467,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %ve
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16490,7 +16490,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16511,7 +16511,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %ve
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16534,7 +16534,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16569,7 +16569,7 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %ve
 ; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16592,7 +16592,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double
 ; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16626,7 +16626,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1,
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16648,7 +16648,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %v
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16667,7 +16667,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1,
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16689,7 +16689,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %v
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16708,7 +16708,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1,
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16730,7 +16730,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %v
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16762,7 +16762,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1,
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: vmovapd %zmm2, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16784,7 +16784,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %v
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16817,7 +16817,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16840,7 +16840,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16861,7 +16861,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16884,7 +16884,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16905,7 +16905,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16928,7 +16928,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
@@ -16963,7 +16963,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve
 ; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: vmovapd %zmm1, %zmm0
 ; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -16986,7 +16986,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double
 ; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
 ; GENERIC: # %bb.0:
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 91c3b73a95904..4c650903e4716 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -100,7 +100,6 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-NEXT: kshiftlb $2, %k0, %k0
 ; CHECK-NEXT: kshiftrb $2, %k0, %k0
 ; CHECK-NEXT: korb %k1, %k0, %k0
-; CHECK-NEXT: kunpckbw %k0, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
 ; CHECK-NEXT: retq
@@ -119,8 +118,6 @@ define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
 ; CHECK-NEXT: kshiftlb $4, %k0, %k0
 ; CHECK-NEXT: kshiftrb $4, %k0, %k0
 ; CHECK-NEXT: korb %k1, %k0, %k0
-; CHECK-NEXT: kunpckbw %k0, %k0, %k0
-; CHECK-NEXT: kunpckwd %k0, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %ymm0
 ; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 887ec006956cd..8bec292283b82 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -874,8 +874,8 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
 ; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
 ; KNL-NEXT: kxnorw %k1, %k0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: ## kill: %ymm0 %ymm0 %zmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: test28:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index cd6f70b36ff55..a0b13fa16b1b9 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -4,6 +4,117 @@
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
+define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
+; X32-LABEL: test_mm512_kunpackd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
+; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
+; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
+; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: kunpckdq %k0, %k1, %k1
+; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_kunpackd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
+; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
+; X64-NEXT: kunpckdq %k0, %k1, %k1
+; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__B to <64 x i8>
+  %1 = bitcast <8 x i64> %__A to <64 x i8>
+  %2 = icmp ne <64 x i8> %0, %1
+  %3 = bitcast <64 x i1> %2 to i64
+  %4 = bitcast <8 x i64> %__C to <64 x i8>
+  %5 = bitcast <8 x i64> %__D to <64 x i8>
+  %6 = icmp ne <64 x i8> %4, %5
+  %7 = bitcast <64 x i1> %6 to i64
+  %and.i = and i64 %7, 4294967295
+  %shl.i = shl i64 %3, 32
+  %or.i = or i64 %and.i, %shl.i
+  %8 = bitcast <8 x i64> %__E to <64 x i8>
+  %9 = bitcast <8 x i64> %__F to <64 x i8>
+  %10 = icmp ne <64 x i8> %8, %9
+  %11 = bitcast i64 %or.i to <64 x i1>
+  %12 = and <64 x i1> %10, %11
+  %13 = bitcast <64 x i1> %12 to i64
+  ret i64 %13
+}
+
+define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
+; X32-LABEL: test_mm512_kunpackw:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
+; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
+; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
+; X32-NEXT: kunpckwd %k0, %k1, %k1
+; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_kunpackw:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
+; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
+; X64-NEXT: kunpckwd %k0, %k1, %k1
+; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__B to <32 x i16>
+  %1 = bitcast <8 x i64> %__A to <32 x i16>
+  %2 = icmp ne <32 x i16> %0, %1
+  %3 = bitcast <32 x i1> %2 to i32
+  %4 = bitcast <8 x i64> %__C to <32 x i16>
+  %5 = bitcast <8 x i64> %__D to <32 x i16>
+  %6 = icmp ne <32 x i16> %4, %5
+  %7 = bitcast <32 x i1> %6 to i32
+  %and.i = and i32 %7, 65535
+  %shl.i = shl i32 %3, 16
+  %or.i = or i32 %and.i, %shl.i
+  %8 = bitcast <8 x i64> %__E to <32 x i16>
+  %9 = bitcast <8 x i64> %__F to <32 x i16>
+  %10 = icmp ne <32 x i16> %8, %9
+  %11 = bitcast i32 %or.i to <32 x i1>
+  %12 = and <32 x i1> %10, %11
+  %13 = bitcast <32 x i1> %12 to i32
+  ret i32 %13
+}
+
+
 define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
 ; X32-LABEL: test_mm512_mask_set1_epi8:
 ; X32: # %bb.0: # %entry
@@ -694,13 +805,13 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
 ; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
-; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT: vpmovb2m %zmm0, %k0
-; X32-NEXT: vpmovm2b %k0, %zmm0
-; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X32-NEXT: movl %eax, %ecx
 ; X32-NEXT: shrl $30, %ecx
 ; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X32-NEXT: vpmovm2b %k0, %zmm2
 ; X32-NEXT: vpbroadcastw %xmm2, %xmm2
 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -1422,13 +1533,13 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
 ; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
-; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X32-NEXT: vpmovb2m %zmm0, %k0
-; X32-NEXT: vpmovm2b %k0, %zmm0
-; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X32-NEXT: movl %eax, %ecx
 ; X32-NEXT: shrl $30, %ecx
 ; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; X32-NEXT: vpmovm2b %k0, %zmm2
 ; X32-NEXT: vpbroadcastw %xmm2, %xmm2
 ; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index f420be32af06d..d739066b1da9d 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2,6 +2,45 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
+
+define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: movzwl %di, %eax
+; AVX512BW-NEXT: shll $16, %esi
+; AVX512BW-NEXT: orl %esi, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: shll $16, %eax
+; AVX512F-32-NEXT: orl %ecx, %eax
+; AVX512F-32-NEXT: retl
+  %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
+  ret i32 %res
+}
+
+declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
+
+define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: shlq $32, %rsi
+; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: xorl %eax, %eax
+; AVX512F-32-NEXT: retl
+  %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
+  ret i64 %res
+}
+
 declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 7b5cc5feff0c0..2fa7c2c5b8a8c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1455,55 +1455,6 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
   ret <8 x i64> %res2
 }
 
-declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
-
-define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k0
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
-  %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
-  ret i32 %res
-}
-
-declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
-
-define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovq %rdi, %k0
-; AVX512BW-NEXT: kmovq %rsi, %k1
-; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
-  %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
-  ret i64 %res
-}
-
 declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
 
 define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 53e8a5c064407..7969a9ff1df35 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -50208,8 +50208,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i6
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
@@ -50274,8 +50273,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
@@ -50341,8 +50339,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -50581,8 +50578,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
@@ -50646,8 +50642,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
@@ -50712,8 +50707,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -50915,8 +50909,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50966,8 +50959,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -51018,8 +51010,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -51219,8 +51210,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -51276,8 +51266,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -51334,8 +51323,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
 ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -55620,8 +55608,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i6 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm2, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -55688,8 +55675,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -55757,8 +55743,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, < ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -56005,8 +55990,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm2, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -56072,8 +56056,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -56140,8 +56123,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -56351,8 +56333,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm2, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, 
%zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -56404,8 +56385,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -56458,8 +56438,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -56667,8 +56646,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm2, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -56726,8 +56704,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -56786,8 +56763,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index e0dd7f253e863..360ee6b15bedd 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -109,9 +109,8 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -166,8 +165,9 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX512F-LABEL: ext_i8_8i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: 
vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -372,8 +372,8 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX512F-LABEL: ext_i8_8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLBW-LABEL: ext_i8_8i32: diff --git a/test/CodeGen/X86/gpr-to-mask.ll b/test/CodeGen/X86/gpr-to-mask.ll index 1928a6c80f575..f558541416a0a 100644 --- a/test/CodeGen/X86/gpr-to-mask.ll +++ b/test/CodeGen/X86/gpr-to-mask.ll @@ -1,20 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-64 +; RUN: llc -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-32 define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) { -; CHECK-LABEL: test_fcmp_storefloat: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB0_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k1 -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_2: # %else -; CHECK-NEXT: vcmpeqss %xmm5, %xmm4, %k1 -; CHECK-NEXT: .LBB0_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_fcmp_storefloat: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB0_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k1 +; X86-64-NEXT: jmp .LBB0_3 +; X86-64-NEXT: .LBB0_2: # %else +; X86-64-NEXT: vcmpeqss %xmm5, %xmm4, %k1 +; X86-64-NEXT: .LBB0_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_fcmp_storefloat: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB0_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1 +; X86-32-NEXT: jmp .LBB0_3 +; X86-32-NEXT: .LBB0_2: # %else +; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1 +; X86-32-NEXT: .LBB0_3: # %exit +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -34,20 +54,38 @@ exit: } define void @test_fcmp_storei1(i1 %cond, float* %fptr, i1* %iptr, float %f1, float %f2, float %f3, float %f4) { -; CHECK-LABEL: test_fcmp_storei1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB1_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0 -; CHECK-NEXT: jmp .LBB1_3 -; CHECK-NEXT: .LBB1_2: # %else -; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k0 -; 
CHECK-NEXT: .LBB1_3: # %exit -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movb %al, (%rdx) -; CHECK-NEXT: retq +; X86-64-LABEL: test_fcmp_storei1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB1_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; X86-64-NEXT: jmp .LBB1_3 +; X86-64-NEXT: .LBB1_2: # %else +; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k0 +; X86-64-NEXT: .LBB1_3: # %exit +; X86-64-NEXT: kmovd %k0, %eax +; X86-64-NEXT: andb $1, %al +; X86-64-NEXT: movb %al, (%rdx) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_fcmp_storei1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB1_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0 +; X86-32-NEXT: jmp .LBB1_3 +; X86-32-NEXT: .LBB1_2: # %else +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0 +; X86-32-NEXT: .LBB1_3: # %exit +; X86-32-NEXT: kmovd %k0, %ecx +; X86-32-NEXT: andb $1, %cl +; X86-32-NEXT: movb %cl, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -66,21 +104,42 @@ exit: } define void @test_load_add(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) { -; CHECK-LABEL: test_load_add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB2_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rdx), %k0 -; CHECK-NEXT: kmovb (%rcx), %k1 -; CHECK-NEXT: kaddb %k1, %k0, %k1 -; CHECK-NEXT: jmp .LBB2_3 -; CHECK-NEXT: .LBB2_2: # %else -; CHECK-NEXT: kmovb (%rcx), %k1 -; CHECK-NEXT: .LBB2_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_load_add: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB2_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rdx), %k0 +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: kaddb %k1, %k0, %k1 +; X86-64-NEXT: jmp .LBB2_3 +; X86-64-NEXT: .LBB2_2: # %else +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: .LBB2_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_load_add: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB2_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: kmovb (%edx), %k0 +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: kaddb %k1, %k0, %k1 +; X86-32-NEXT: jmp .LBB2_3 +; X86-32-NEXT: .LBB2_2: # %else +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB2_3: # %exit +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -102,19 +161,37 @@ exit: } define void @test_load_i1(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) { -; CHECK-LABEL: test_load_i1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB3_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: jmp .LBB3_3 -; CHECK-NEXT: .LBB3_2: # %else -; CHECK-NEXT: 
kmovb (%rcx), %k1 -; CHECK-NEXT: .LBB3_3: # %exit -; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovss %xmm1, (%rsi) -; CHECK-NEXT: retq +; X86-64-LABEL: test_load_i1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB3_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: jmp .LBB3_3 +; X86-64-NEXT: .LBB3_2: # %else +; X86-64-NEXT: kmovb (%rcx), %k1 +; X86-64-NEXT: .LBB3_3: # %exit +; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; X86-64-NEXT: vmovss %xmm1, (%rsi) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_load_i1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB3_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: jmp .LBB3_3 +; X86-32-NEXT: .LBB3_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: .LBB3_3: # %exit +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; X86-32-NEXT: vmovss %xmm0, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -134,19 +211,35 @@ exit: } define void @test_loadi1_storei1(i1 %cond, i1* %iptr1, i1* %iptr2, i1* %iptr3) { -; CHECK-LABEL: test_loadi1_storei1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB4_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: jmp .LBB4_3 -; CHECK-NEXT: .LBB4_2: # %else -; CHECK-NEXT: movb (%rdx), %al -; CHECK-NEXT: .LBB4_3: # %exit -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movb %al, (%rcx) -; CHECK-NEXT: retq +; X86-64-LABEL: test_loadi1_storei1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB4_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: movb (%rsi), %al +; X86-64-NEXT: jmp .LBB4_3 +; X86-64-NEXT: .LBB4_2: # %else +; X86-64-NEXT: movb (%rdx), %al +; X86-64-NEXT: .LBB4_3: # %exit +; X86-64-NEXT: andb $1, %al +; X86-64-NEXT: movb %al, (%rcx) +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_loadi1_storei1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB4_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: jmp .LBB4_3 +; X86-32-NEXT: .LBB4_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: .LBB4_3: # %exit +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: andb $1, %cl +; X86-32-NEXT: movb %cl, (%eax) +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -165,23 +258,44 @@ exit: } define void @test_shl1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shl1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB5_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kaddb %k0, %k0, %k1 -; CHECK-NEXT: jmp .LBB5_3 -; CHECK-NEXT: .LBB5_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB5_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shl1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; 
X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB5_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kaddb %k0, %k0, %k1 +; X86-64-NEXT: jmp .LBB5_3 +; X86-64-NEXT: .LBB5_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB5_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shl1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB5_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kaddb %k0, %k0, %k1 +; X86-32-NEXT: jmp .LBB5_3 +; X86-32-NEXT: .LBB5_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB5_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -203,24 +317,46 @@ exit: } define void @test_shr1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shr1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB6_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: shrb %al -; CHECK-NEXT: jmp .LBB6_3 -; CHECK-NEXT: .LBB6_2: # %else -; CHECK-NEXT: movb (%rdx), %al -; CHECK-NEXT: .LBB6_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shr1: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB6_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: movb (%rsi), %al +; X86-64-NEXT: shrb %al +; X86-64-NEXT: jmp .LBB6_3 +; X86-64-NEXT: .LBB6_2: # %else +; X86-64-NEXT: movb (%rdx), %al +; X86-64-NEXT: .LBB6_3: # %exit +; X86-64-NEXT: kmovd %eax, %k1 +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shr1: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB6_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: shrb %cl +; X86-32-NEXT: jmp .LBB6_3 +; X86-32-NEXT: .LBB6_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movb (%ecx), %cl +; X86-32-NEXT: .LBB6_3: # %exit +; X86-32-NEXT: kmovd %ecx, %k1 +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -242,23 +378,44 @@ exit: } define void @test_shr2(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shr2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB7_2 
-; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kshiftrb $2, %k0, %k1 -; CHECK-NEXT: jmp .LBB7_3 -; CHECK-NEXT: .LBB7_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB7_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shr2: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB7_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kshiftrb $2, %k0, %k1 +; X86-64-NEXT: jmp .LBB7_3 +; X86-64-NEXT: .LBB7_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB7_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shr2: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB7_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kshiftrb $2, %k0, %k1 +; X86-32-NEXT: jmp .LBB7_3 +; X86-32-NEXT: .LBB7_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB7_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -280,23 +437,44 @@ exit: } define void @test_shl(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_shl: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB8_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kshiftlb $6, %k0, %k1 -; CHECK-NEXT: jmp .LBB8_3 -; CHECK-NEXT: .LBB8_2: # %else -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: .LBB8_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_shl: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB8_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kshiftlb $6, %k0, %k1 +; X86-64-NEXT: jmp .LBB8_3 +; X86-64-NEXT: .LBB8_2: # %else +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: .LBB8_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_shl: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB8_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k0 +; X86-32-NEXT: kshiftlb $6, %k0, %k1 +; X86-32-NEXT: jmp .LBB8_3 +; X86-32-NEXT: .LBB8_2: # %else +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: .LBB8_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, 
(%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: br i1 %cond, label %if, label %else @@ -318,24 +496,46 @@ exit: } define void @test_add(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) { -; CHECK-LABEL: test_add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: %ymm1 %ymm1 %zmm1 -; CHECK-NEXT: # kill: %ymm0 %ymm0 %zmm0 -; CHECK-NEXT: kmovb (%rsi), %k0 -; CHECK-NEXT: kmovb (%rdx), %k1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB9_2 -; CHECK-NEXT: # %bb.1: # %if -; CHECK-NEXT: kandb %k1, %k0, %k1 -; CHECK-NEXT: jmp .LBB9_3 -; CHECK-NEXT: .LBB9_2: # %else -; CHECK-NEXT: kaddb %k1, %k0, %k1 -; CHECK-NEXT: .LBB9_3: # %exit -; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %ymm1, (%rcx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X86-64-LABEL: test_add: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-64-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-64-NEXT: kmovb (%rsi), %k0 +; X86-64-NEXT: kmovb (%rdx), %k1 +; X86-64-NEXT: testb $1, %dil +; X86-64-NEXT: je .LBB9_2 +; X86-64-NEXT: # %bb.1: # %if +; X86-64-NEXT: kandb %k1, %k0, %k1 +; X86-64-NEXT: jmp .LBB9_3 +; X86-64-NEXT: .LBB9_2: # %else +; X86-64-NEXT: kaddb %k1, %k0, %k1 +; X86-64-NEXT: .LBB9_3: # %exit +; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vzeroupper +; X86-64-NEXT: retq +; +; X86-32-LABEL: test_add: +; X86-32: # %bb.0: # %entry +; X86-32-NEXT: # kill: %ymm1 %ymm1 %zmm1 +; X86-32-NEXT: # kill: %ymm0 %ymm0 %zmm0 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: kmovb (%edx), %k0 +; X86-32-NEXT: kmovb (%ecx), %k1 +; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-32-NEXT: je .LBB9_2 +; X86-32-NEXT: # %bb.1: # %if +; X86-32-NEXT: kandb %k1, %k0, %k1 +; X86-32-NEXT: jmp .LBB9_3 +; X86-32-NEXT: .LBB9_2: # %else +; X86-32-NEXT: kaddb %k1, %k0, %k1 +; X86-32-NEXT: .LBB9_3: # %exit +; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vzeroupper +; X86-32-NEXT: retl entry: %loaded1 = load i8, i8* %ptr1 %loaded2 = load i8, i8* %ptr2 diff --git a/test/CodeGen/X86/pr34605.ll b/test/CodeGen/X86/pr34605.ll index 8c25b068ecf88..19fed5db5bc52 100644 --- a/test/CodeGen/X86/pr34605.ll +++ b/test/CodeGen/X86/pr34605.ll @@ -19,15 +19,15 @@ define void @pr34605(i8* nocapture %s, i32 %p) { ; CHECK-NEXT: kunpckdq %k2, %k1, %k1 ; CHECK-NEXT: kandq %k1, %k0, %k1 ; CHECK-NEXT: vmovdqu8 {{\.LCPI.*}}, %zmm0 {%k1} {z} +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vmovdqu32 %zmm0, (%eax) -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu32 %zmm0, 64(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 128(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 192(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 256(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 320(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 384(%eax) -; CHECK-NEXT: vmovdqu32 %zmm0, 448(%eax) +; CHECK-NEXT: vmovups %zmm1, 64(%eax) +; CHECK-NEXT: vmovups %zmm1, 128(%eax) +; CHECK-NEXT: vmovups %zmm1, 192(%eax) +; CHECK-NEXT: vmovups %zmm1, 256(%eax) +; CHECK-NEXT: vmovups %zmm1, 320(%eax) +; CHECK-NEXT: vmovups %zmm1, 384(%eax) +; CHECK-NEXT: vmovups %zmm1, 448(%eax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl entry: diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 67e497aee0b36..c422862ac1463 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -29,8 
+29,9 @@ define <8 x i16> @pr25080(<8 x i32> %a) { ; KNL-32-NEXT: movb $15, %al ; KNL-32-NEXT: kmovw %eax, %k1 ; KNL-32-NEXT: korw %k1, %k0, %k1 -; KNL-32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-32-NEXT: vpmovqw %zmm0, %xmm0 +; KNL-32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-32-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; KNL-32-NEXT: retl entry: %0 = trunc <8 x i32> %a to <8 x i23> diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll index 6712c0dccc556..0b6c205fd26b0 100644 --- a/test/CodeGen/X86/sse-fsignum.ll +++ b/test/CodeGen/X86/sse-fsignum.ll @@ -93,12 +93,10 @@ define void @signum32b(<8 x float>*) { ; AVX512F-NEXT: vmovaps (%rdi), %ymm0 ; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) diff --git a/test/CodeGen/X86/stack-protector-msvc.ll b/test/CodeGen/X86/stack-protector-msvc.ll index 5eccc65f2dec2..c1f79f9db2f6f 100644 --- a/test/CodeGen/X86/stack-protector-msvc.ll +++ b/test/CodeGen/X86/stack-protector-msvc.ll @@ -1,19 +1,9 @@ +; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X86 %s +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X64 %s -; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s -; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-64 %s - -; MSVC-I386: movl ___security_cookie, %[[REG1:[a-z]*]] -; MSVC-I386: movl %[[REG1]], [[SLOT:[0-9]*]](%esp) -; MSVC-I386: calll _strcpy -; MSVC-I386: movl [[SLOT]](%esp), %ecx -; MSVC-I386: calll @__security_check_cookie@4 -; MSVC-I386: retl - -; MSVC-64: movq __security_cookie(%rip), %[[REG1:[a-z]*]] -; MSVC-64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp) -; MSVC-64: callq strcpy -; MSVC-64: movq [[SLOT]](%rsp), %rcx -; MSVC-64: callq __security_check_cookie +; Make sure fastisel falls back and does something secure. 
+; RUN: llc -mtriple=i686-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X86-O0 %s +; RUN: llc -mtriple=x86_64-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X64-O0 %s @"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1] @@ -21,7 +11,6 @@ define void @test(i8* %a) nounwind ssp { entry: %a_addr = alloca i8* ; [#uses=2] %buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2] - %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] store i8* %a, i8** %a_addr %buf1 = bitcast [8 x i8]* %buf to i8* ; [#uses=1] %0 = load i8*, i8** %a_addr, align 4 ; [#uses=1] @@ -34,6 +23,139 @@ return: ; preds = %entry ret void } +; MSVC-X86-LABEL: _test: +; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]] +; MSVC-X86: xorl %esp, %[[REG1]] +; MSVC-X86: movl %[[REG1]], [[SLOT:[0-9]*]](%esp) +; MSVC-X86: calll _strcpy +; MSVC-X86: movl [[SLOT]](%esp), %ecx +; MSVC-X86: xorl %esp, %ecx +; MSVC-X86: calll @__security_check_cookie@4 +; MSVC-X86: retl + +; MSVC-X64-LABEL: test: +; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]] +; MSVC-X64: xorq %rsp, %[[REG1]] +; MSVC-X64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp) +; MSVC-X64: callq strcpy +; MSVC-X64: movq [[SLOT]](%rsp), %rcx +; MSVC-X64: xorq %rsp, %rcx +; MSVC-X64: callq __security_check_cookie +; MSVC-X64: retq + +; MSVC-X86-O0-LABEL: _test: +; MSVC-X86-O0: movl ___security_cookie, %[[REG1:[^ ]*]] +; MSVC-X86-O0: xorl %esp, %[[REG1]] +; MSVC-X86-O0: movl %[[REG1]], [[SLOT:[0-9]*]](%esp) +; MSVC-X86-O0: calll _strcpy +; MSVC-X86-O0: movl [[SLOT]](%esp), %[[REG1:[^ ]*]] +; MSVC-X86-O0: xorl %esp, %[[REG1]] +; MSVC-X86-O0: movl %[[REG1]], %ecx +; MSVC-X86-O0: calll @__security_check_cookie@4 +; MSVC-X86-O0: retl + +; MSVC-X64-O0-LABEL: test: +; MSVC-X64-O0: movq __security_cookie(%rip), %[[REG1:[^ ]*]] +; MSVC-X64-O0: xorq %rsp, %[[REG1]] +; MSVC-X64-O0: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp) +; MSVC-X64-O0: callq strcpy +; MSVC-X64-O0: movq [[SLOT]](%rsp), %[[REG1:[^ ]*]] +; MSVC-X64-O0: xorq %rsp, %[[REG1]] +; MSVC-X64-O0: movq %[[REG1]], %rcx +; MSVC-X64-O0: callq __security_check_cookie +; MSVC-X64-O0: retq + + +declare void @escape(i32*) + +define void @test_vla(i32 %n) nounwind ssp { + %vla = alloca i32, i32 %n + call void @escape(i32* %vla) + ret void +} + +; MSVC-X86-LABEL: _test_vla: +; MSVC-X86: pushl %ebp +; MSVC-X86: movl %esp, %ebp +; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]] +; MSVC-X86: xorl %ebp, %[[REG1]] +; MSVC-X86: movl %[[REG1]], [[SLOT:-[0-9]*]](%ebp) +; MSVC-X86: calll __chkstk +; MSVC-X86: pushl +; MSVC-X86: calll _escape +; MSVC-X86: movl [[SLOT]](%ebp), %ecx +; MSVC-X86: xorl %ebp, %ecx +; MSVC-X86: calll @__security_check_cookie@4 +; MSVC-X86: movl %ebp, %esp +; MSVC-X86: popl %ebp +; MSVC-X86: retl + +; MSVC-X64-LABEL: test_vla: +; MSVC-X64: pushq %rbp +; MSVC-X64: subq $16, %rsp +; MSVC-X64: leaq 16(%rsp), %rbp +; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]] +; MSVC-X64: xorq %rbp, %[[REG1]] +; MSVC-X64: movq %[[REG1]], [[SLOT:-[0-9]*]](%rbp) +; MSVC-X64: callq __chkstk +; MSVC-X64: callq escape +; MSVC-X64: movq [[SLOT]](%rbp), %rcx +; MSVC-X64: xorq %rbp, %rcx +; MSVC-X64: callq __security_check_cookie +; MSVC-X64: retq + + +; This case is interesting because we address local variables with RBX but XOR +; the guard value with RBP. That's fine, either value will do, as long as they +; are the same across the life of the frame. 
+ +define void @test_vla_realign(i32 %n) nounwind ssp { + %realign = alloca i32, align 32 + %vla = alloca i32, i32 %n + call void @escape(i32* %realign) + call void @escape(i32* %vla) + ret void +} + +; MSVC-X86-LABEL: _test_vla_realign: +; MSVC-X86: pushl %ebp +; MSVC-X86: movl %esp, %ebp +; MSVC-X86: pushl %esi +; MSVC-X86: andl $-32, %esp +; MSVC-X86: subl $32, %esp +; MSVC-X86: movl %esp, %esi +; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]] +; MSVC-X86: xorl %ebp, %[[REG1]] +; MSVC-X86: movl %[[REG1]], [[SLOT:[0-9]*]](%esi) +; MSVC-X86: calll __chkstk +; MSVC-X86: pushl +; MSVC-X86: calll _escape +; MSVC-X86: movl [[SLOT]](%esi), %ecx +; MSVC-X86: xorl %ebp, %ecx +; MSVC-X86: calll @__security_check_cookie@4 +; MSVC-X86: leal -8(%ebp), %esp +; MSVC-X86: popl %esi +; MSVC-X86: popl %ebp +; MSVC-X86: retl + +; MSVC-X64-LABEL: test_vla_realign: +; MSVC-X64: pushq %rbp +; MSVC-X64: pushq %rbx +; MSVC-X64: subq $32, %rsp +; MSVC-X64: leaq 32(%rsp), %rbp +; MSVC-X64: andq $-32, %rsp +; MSVC-X64: movq %rsp, %rbx +; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]] +; MSVC-X64: xorq %rbp, %[[REG1]] +; MSVC-X64: movq %[[REG1]], [[SLOT:[0-9]*]](%rbx) +; MSVC-X64: callq __chkstk +; MSVC-X64: callq escape +; MSVC-X64: movq [[SLOT]](%rbx), %rcx +; MSVC-X64: xorq %rbp, %rcx +; MSVC-X64: callq __security_check_cookie +; MSVC-X64: retq + + declare i8* @strcpy(i8*, i8*) nounwind declare i32 @printf(i8*, ...) nounwind diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll index de40d30cc4826..3708d216f8df0 100644 --- a/test/CodeGen/X86/stack-protector-weight.ll +++ b/test/CodeGen/X86/stack-protector-weight.ll @@ -21,10 +21,11 @@ ; MSVC-SELDAG: LD4[FixedStack0] ; MSVC-SELDAG: CALLpcrel32 +; MSVC always uses selection DAG now. 
; MSVC-IR: # Machine code for function test_branch_weights: ; MSVC-IR: mem:Volatile LD4[@__security_cookie] ; MSVC-IR: ST4[FixedStack0] -; MSVC-IR: LD4[%StackGuardSlot] +; MSVC-IR: LD4[FixedStack0] ; MSVC-IR: CALLpcrel32 define i32 @test_branch_weights(i32 %n) #0 { diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 0eb3a64311a76..51b5219f30243 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -8,6 +8,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; Make sure we don't crash with avx512bw and xop +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw + define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-LABEL: test_bitreverse_i8: ; SSE: # %bb.0: diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 5ceb4b1cb88a2..6ac0c7b3d33f9 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -615,16 +615,18 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v8f64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -792,16 +794,18 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v8i64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 88fc588d27e97..24444ecdc1948 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1435,9 +1435,8 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %zmm0 ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1445,9 +1444,8 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %xmm0 %xmm0 %ymm0 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: %xmm0 %xmm0 %zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1999,8 +1997,9 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2402,16 +2401,16 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_sext_8i1_to_8i32: ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: %ymm0 %ymm0 %zmm0 ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_8i1_to_8i32: diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index eba442e3138b5..b3219a330586e 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -114,8 +114,9 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -240,8 +241,9 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll index 767f70d1190fe..7861eead9e2a0 100644 --- a/test/CodeGen/X86/x87-schedule.ll +++ b/test/CodeGen/X86/x87-schedule.ll @@ -596,28 +596,28 @@ define void @test_fcmov() optsize { ; ATOM-LABEL: test_fcmov: ; ATOM: # %bb.0: ; ATOM-NEXT: #APP -; ATOM-NEXT: fcmovb %st(1), %st(0) -; ATOM-NEXT: fcmovbe %st(1), %st(0) -; ATOM-NEXT: fcmove %st(1), %st(0) -; ATOM-NEXT: fcmovnb %st(1), %st(0) -; 
ATOM-NEXT: fcmovnbe %st(1), %st(0) -; ATOM-NEXT: fcmovne %st(1), %st(0) -; ATOM-NEXT: fcmovnu %st(1), %st(0) -; ATOM-NEXT: fcmovu %st(1), %st(0) +; ATOM-NEXT: fcmovb %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovbe %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmove %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovnb %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovnbe %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovne %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovnu %st(1), %st(0) # sched: [9:4.50] +; ATOM-NEXT: fcmovu %st(1), %st(0) # sched: [9:4.50] ; ATOM-NEXT: #NO_APP ; ATOM-NEXT: retl # sched: [79:39.50] ; ; SLM-LABEL: test_fcmov: ; SLM: # %bb.0: ; SLM-NEXT: #APP -; SLM-NEXT: fcmovb %st(1), %st(0) -; SLM-NEXT: fcmovbe %st(1), %st(0) -; SLM-NEXT: fcmove %st(1), %st(0) -; SLM-NEXT: fcmovnb %st(1), %st(0) -; SLM-NEXT: fcmovnbe %st(1), %st(0) -; SLM-NEXT: fcmovne %st(1), %st(0) -; SLM-NEXT: fcmovnu %st(1), %st(0) -; SLM-NEXT: fcmovu %st(1), %st(0) +; SLM-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; SLM-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; SLM-NEXT: #NO_APP ; SLM-NEXT: retl # sched: [4:1.00] ; @@ -638,70 +638,70 @@ define void @test_fcmov() optsize { ; HASWELL-LABEL: test_fcmov: ; HASWELL: # %bb.0: ; HASWELL-NEXT: #APP -; HASWELL-NEXT: fcmovb %st(1), %st(0) -; HASWELL-NEXT: fcmovbe %st(1), %st(0) -; HASWELL-NEXT: fcmove %st(1), %st(0) -; HASWELL-NEXT: fcmovnb %st(1), %st(0) -; HASWELL-NEXT: fcmovnbe %st(1), %st(0) -; HASWELL-NEXT: fcmovne %st(1), %st(0) -; HASWELL-NEXT: fcmovnu %st(1), %st(0) -; HASWELL-NEXT: fcmovu %st(1), %st(0) +; HASWELL-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; HASWELL-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; HASWELL-NEXT: #NO_APP ; HASWELL-NEXT: retl # sched: [5:0.50] ; ; BROADWELL-LABEL: test_fcmov: ; BROADWELL: # %bb.0: ; BROADWELL-NEXT: #APP -; BROADWELL-NEXT: fcmovb %st(1), %st(0) -; BROADWELL-NEXT: fcmovbe %st(1), %st(0) -; BROADWELL-NEXT: fcmove %st(1), %st(0) -; BROADWELL-NEXT: fcmovnb %st(1), %st(0) -; BROADWELL-NEXT: fcmovnbe %st(1), %st(0) -; BROADWELL-NEXT: fcmovne %st(1), %st(0) -; BROADWELL-NEXT: fcmovnu %st(1), %st(0) -; BROADWELL-NEXT: fcmovu %st(1), %st(0) +; BROADWELL-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; BROADWELL-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; BROADWELL-NEXT: #NO_APP ; BROADWELL-NEXT: retl # sched: [6:0.50] ; ; SKYLAKE-LABEL: test_fcmov: ; SKYLAKE: # %bb.0: ; SKYLAKE-NEXT: #APP -; SKYLAKE-NEXT: fcmovb %st(1), %st(0) -; SKYLAKE-NEXT: fcmovbe %st(1), %st(0) 
-; SKYLAKE-NEXT: fcmove %st(1), %st(0) -; SKYLAKE-NEXT: fcmovnb %st(1), %st(0) -; SKYLAKE-NEXT: fcmovnbe %st(1), %st(0) -; SKYLAKE-NEXT: fcmovne %st(1), %st(0) -; SKYLAKE-NEXT: fcmovnu %st(1), %st(0) -; SKYLAKE-NEXT: fcmovu %st(1), %st(0) +; SKYLAKE-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; SKYLAKE-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; SKYLAKE-NEXT: #NO_APP ; SKYLAKE-NEXT: retl # sched: [6:0.50] ; ; SKX-LABEL: test_fcmov: ; SKX: # %bb.0: ; SKX-NEXT: #APP -; SKX-NEXT: fcmovb %st(1), %st(0) -; SKX-NEXT: fcmovbe %st(1), %st(0) -; SKX-NEXT: fcmove %st(1), %st(0) -; SKX-NEXT: fcmovnb %st(1), %st(0) -; SKX-NEXT: fcmovnbe %st(1), %st(0) -; SKX-NEXT: fcmovne %st(1), %st(0) -; SKX-NEXT: fcmovnu %st(1), %st(0) -; SKX-NEXT: fcmovu %st(1), %st(0) +; SKX-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; SKX-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; SKX-NEXT: #NO_APP ; SKX-NEXT: retl # sched: [6:0.50] ; ; BTVER2-LABEL: test_fcmov: ; BTVER2: # %bb.0: ; BTVER2-NEXT: #APP -; BTVER2-NEXT: fcmovb %st(1), %st(0) -; BTVER2-NEXT: fcmovbe %st(1), %st(0) -; BTVER2-NEXT: fcmove %st(1), %st(0) -; BTVER2-NEXT: fcmovnb %st(1), %st(0) -; BTVER2-NEXT: fcmovnbe %st(1), %st(0) -; BTVER2-NEXT: fcmovne %st(1), %st(0) -; BTVER2-NEXT: fcmovnu %st(1), %st(0) -; BTVER2-NEXT: fcmovu %st(1), %st(0) +; BTVER2-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00] +; BTVER2-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00] ; BTVER2-NEXT: #NO_APP ; BTVER2-NEXT: retl # sched: [4:1.00] ; @@ -710,7 +710,7 @@ define void @test_fcmov() optsize { ; ZNVER1-NEXT: #APP ; ZNVER1-NEXT: fcmovb %st(1), %st(0) # sched: [100:?] ; ZNVER1-NEXT: fcmovbe %st(1), %st(0) # sched: [100:?] -; ZNVER1-NEXT: fcmove %st(1), %st(0) +; ZNVER1-NEXT: fcmove %st(1), %st(0) # sched: [100:?] ; ZNVER1-NEXT: fcmovnb %st(1), %st(0) # sched: [100:?] ; ZNVER1-NEXT: fcmovnbe %st(1), %st(0) # sched: [100:?] ; ZNVER1-NEXT: fcmovne %st(1), %st(0) # sched: [100:?] 
diff --git a/test/MC/ARM/dwarf-asm-multiple-sections.s b/test/MC/ARM/dwarf-asm-multiple-sections.s index 1ae0bdf33b1ac..3f2e9b8efa303 100644 --- a/test/MC/ARM/dwarf-asm-multiple-sections.s +++ b/test/MC/ARM/dwarf-asm-multiple-sections.s @@ -1,14 +1,14 @@ // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 5 -fdebug-compilation-dir=/tmp // RUN: llvm-dwarfdump -v %t | FileCheck -check-prefix DWARF -check-prefix DWARF45 %s -// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL %s +// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL -check-prefix DWARF-DL-5 -DDWVER=5 %s // RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC -check-prefix RELOC5 %s // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp // RUN: llvm-dwarfdump -v %t | FileCheck -check-prefix DWARF -check-prefix DWARF45 %s -// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL %s +// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL -DDWVER=4 %s // RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC -check-prefix RELOC4 %s // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 3 -fdebug-compilation-dir=/tmp // RUN: llvm-dwarfdump -v %t | FileCheck -check-prefix DWARF -check-prefix DWARF3 %s -// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL %s +// RUN: llvm-dwarfdump --debug-line %t | FileCheck -check-prefix DWARF-DL -DDWVER=3 %s // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 2 2>&1 | FileCheck -check-prefix VERSION %s // RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 1 2>&1 | FileCheck -check-prefix DWARF1 %s // RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 6 2>&1 | FileCheck -check-prefix DWARF6 %s @@ -52,6 +52,10 @@ b: // DWARF-DL: .debug_line contents: +// DWARF-DL: version: [[DWVER]] +// DWARF-DL-5: address_size: 4 +// DWARF-DL-5: include_directories[ 1] = '' +// DWARF-DL: file_names[ 1] {{.*}} // DWARF-DL: 0x0000000000000000 17 0 1 0 0 is_stmt // DWARF-DL-NEXT: 0x0000000000000004 17 0 1 0 0 is_stmt end_sequence // DWARF-DL-NEXT: 0x0000000000000000 21 0 1 0 0 is_stmt diff --git a/test/SafepointIRVerifier/use-derived-unrelocated.ll b/test/SafepointIRVerifier/use-derived-unrelocated.ll new file mode 100644 index 0000000000000..d2d7382554474 --- /dev/null +++ b/test/SafepointIRVerifier/use-derived-unrelocated.ll @@ -0,0 +1,149 @@ +; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s + +; Checking if verifier accepts chain of GEPs/bitcasts. +define void @test.deriving.ok(i32, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.deriving.ok +; CHECK-NEXT: No illegal uses found by SafepointIRVerifier in: test.deriving.ok + %ptr = getelementptr i8, i8 addrspace(1)* %base1, i64 4 + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base1) + %ptr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8 + %ptr.i32 = bitcast i8 addrspace(1)* %ptr to i32 addrspace(1)* + %ptr2.i32 = bitcast i8 addrspace(1)* %ptr2 to i32 addrspace(1)* + ret void +} + +; Checking if verifier accepts cmp of two derived pointers when one defined +; before safepoint and one after and both have unrelocated base. +define void @test.cmp.ok(i32, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.cmp.ok +; CHECK-NEXT: No illegal uses found by SafepointIRVerifier in: test.cmp.ok + %ptr = getelementptr i8, i8 addrspace(1)* %base1, i64 4 + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base1) + %ptr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8 + %c2 = icmp sgt i8 addrspace(1)* %ptr2, %ptr + ret void +} + +; Checking if verifier accepts cmp of two derived pointers when one defined +; before safepoint and one after and both have unrelocated base. One of pointers +; defined as a long chain of geps/bitcasts. +define void @test.cmp-long_chain.ok(i32, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.cmp-long_chain.ok +; CHECK-NEXT: No illegal uses found by SafepointIRVerifier in: test.cmp-long_chain.ok + %ptr = getelementptr i8, i8 addrspace(1)* %base1, i64 4 + %ptr.i32 = bitcast i8 addrspace(1)* %ptr to i32 addrspace(1)* + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base1) + %ptr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8 + %ptr2.i32 = bitcast i8 addrspace(1)* %ptr2 to i32 addrspace(1)* + %ptr2.i32.2 = getelementptr i32, i32 addrspace(1)* %ptr2.i32, i64 4 + %ptr2.i32.3 = getelementptr i32, i32 addrspace(1)* %ptr2.i32.2, i64 8 + %ptr2.i32.4 = getelementptr i32, i32 addrspace(1)* %ptr2.i32.3, i64 8 + %ptr2.i32.5 = getelementptr i32, i32 addrspace(1)* %ptr2.i32.4, i64 8 + %ptr2.i32.6 = getelementptr i32, i32 addrspace(1)* %ptr2.i32.5, i64 8 + %ptr2.i32.6.i8 = bitcast i32 addrspace(1)* %ptr2.i32.6 to i8 addrspace(1)* + %ptr2.i32.6.i8.i32 = bitcast i8 addrspace(1)* %ptr2.i32.6.i8 to i32 addrspace(1)* + %ptr2.i32.6.i8.i32.2 = getelementptr i32, i32 addrspace(1)* %ptr2.i32.6.i8.i32, i64 8 + %c2 = icmp sgt i32 addrspace(1)* %ptr2.i32.6.i8.i32.2, %ptr.i32 + ret void +} + +; GEP and bitcast of unrelocated pointer is acceptable, but load by resulting +; pointer should be reported. +define void @test.load.fail(i32, i8 addrspace(1)* %base) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.load.fail + %ptr = getelementptr i8, i8 addrspace(1)* %base, i64 4 + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base) + %ptr.i32 = bitcast i8 addrspace(1)* %ptr to i32 addrspace(1)* ; it's ok +; CHECK-NEXT: Illegal use of unrelocated value found! 
+; CHECK-NEXT: Def: %ptr.i32 = bitcast i8 addrspace(1)* %ptr to i32 addrspace(1)* +; CHECK-NEXT: Use: %ptr.val = load i32, i32 addrspace(1)* %ptr.i32 + %ptr.val = load i32, i32 addrspace(1)* %ptr.i32 + ret void +} + +; Comparison between pointer derived from unrelocated one (though defined after +; safepoint) and relocated pointer should be reported. +define void @test.cmp.fail(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.cmp.fail + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0) + %base2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2 + %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg +; CHECK-NEXT: Illegal use of unrelocated value found! +; CHECK-NEXT: Def: %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg +; CHECK-NEXT: Use: %cmp = icmp eq i8 addrspace(1)* %addr1, %base2.relocated + %cmp = icmp eq i8 addrspace(1)* %addr1, %base2.relocated + ret void +} + +; Same as test.cmp.fail but splitted into two BBs. +define void @test.cmp2.fail(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +.b0: +; CHECK-LABEL: Verifying gc pointers in function: test.cmp2.fail + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0) + %base2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2 + %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg + br label %.b1 + +.b1: +; CHECK-NEXT: Illegal use of unrelocated value found! +; CHECK-NEXT: Def: %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg +; CHECK-NEXT: Use: %cmp = icmp eq i8 addrspace(1)* %addr1, %base2.relocated + %cmp = icmp eq i8 addrspace(1)* %addr1, %base2.relocated + ret void +} + +; Checking that cmp of two unrelocated pointers is OK and load is not. +define void @test.cmp-load.fail(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.cmp-load.fail + %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0) + %addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8 + %cmp = icmp eq i8 addrspace(1)* %addr1, %addr2 +; CHECK-NEXT: Illegal use of unrelocated value found! +; CHECK-NEXT: Def: %addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8 +; CHECK-NEXT: Use: %val = load i8, i8 addrspace(1)* %addr2 + %val = load i8, i8 addrspace(1)* %addr2 + ret void +} + +; Same as test.cmp-load.fail but splitted into thee BBs. +define void @test.cmp-load2.fail(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" { +.b0: +; CHECK-LABEL: Verifying gc pointers in function: test.cmp-load2.fail + %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) 
+  br label %.b1
+
+.b1:
+  %addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8
+  br label %.b2
+
+.b2:
+  %cmp = icmp eq i8 addrspace(1)* %addr1, %addr2
+; CHECK-NEXT: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: %addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 8
+; CHECK-NEXT: Use: %val = load i8, i8 addrspace(1)* %addr2
+  %val = load i8, i8 addrspace(1)* %addr2
+  ret void
+}
+
+; Same as test.cmp.ok but with multiple safepoints within one BB, where the
+; last one sits at the very end of the BB, so that the Contribution of this BB
+; is empty.
+define void @test.cmp.multi-sp.ok(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test.cmp.multi-sp.ok
+; CHECK-NEXT: No illegal uses found by SafepointIRVerifier in: test.cmp.multi-sp.ok
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0)
+  %base2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2
+  %addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
+  %safepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2.relocated, i32 -1, i32 0, i32 0, i32 0)
+  %base2.relocated2 = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token2, i32 7, i32 7) ; base2.relocated, base2.relocated
+  %addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 %arg
+  %cmp = icmp eq i8 addrspace(1)* %addr1, %addr2
+  %safepoint_token3 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2.relocated2, i32 -1, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
+
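The tests above all exercise the same rule: after a safepoint, a value derived from an unrelocated base may still feed uses that only inspect the pointer value (icmp, getelementptr, bitcast), but it must not be dereferenced. A minimal sketch of that predicate, assuming nothing about the verifier's real internals (the helper name is hypothetical):

```cpp
#include "llvm/IR/Instructions.h"

// Hypothetical helper mirroring the rule the tests above check; this is not
// the actual SafepointIRVerifier implementation.
static bool isAllowedUseOfUnrelocated(const llvm::Instruction &UseInst) {
  using namespace llvm;
  // Comparisons, GEPs and bitcasts never dereference the pointer, so they
  // are tolerated even on unrelocated values (test.cmp.ok above).
  if (isa<ICmpInst>(UseInst) || isa<GetElementPtrInst>(UseInst) ||
      isa<BitCastInst>(UseInst))
    return true;
  // Anything that reads or writes memory through the pointer is an illegal
  // use and must be reported (test.load.fail above).
  return !UseInst.mayReadOrWriteMemory();
}
```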
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index 8bd31074af4ee..bdfda0a3be95f 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -832,6 +832,7 @@ def MOVfpimmz : I<(outs FPR32:$dst), (ins f32imm:$imm), [(set FPR32:$dst, fpimmz
 // CHECK-NEXT:    GIM_Try, /*On fail goto*//*Label 22*/ [[LABEL:[0-9]+]],
 // CHECK-NEXT:    GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
 // CHECK-NEXT:    GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_LOAD,
+// CHECK-NEXT:    GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(int64_t)AtomicOrdering::NotAtomic,
 // CHECK-NEXT:    // MIs[0] dst
 // CHECK-NEXT:    GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
 // CHECK-NEXT:    GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
@@ -860,6 +861,7 @@ def LOAD : I<(outs GPR32:$dst), (ins GPR32:$src1),
 // CHECK-NEXT:    // MIs[0] Operand 1
 // CHECK-NEXT:    GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s16,
 // CHECK-NEXT:    GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_LOAD,
+// CHECK-NEXT:    GIM_CheckAtomicOrdering, /*MI*/1, /*Order*/(int64_t)AtomicOrdering::NotAtomic,
 // CHECK-NEXT:    // MIs[1] Operand 0
 // CHECK-NEXT:    GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s16,
 // CHECK-NEXT:    // MIs[1] src1
diff --git a/test/Transforms/CorrelatedValuePropagation/overflows.ll b/test/Transforms/CorrelatedValuePropagation/overflows.ll
index 5cd6b261be441..a131038b8e0da 100644
--- a/test/Transforms/CorrelatedValuePropagation/overflows.ll
+++ b/test/Transforms/CorrelatedValuePropagation/overflows.ll
@@ -13,8 +13,7 @@ declare void @llvm.trap()
 
 define i32 @signed_add(i32 %x, i32 %y) {
 ; CHECK-LABEL: @signed_add(
-; CHECK: @llvm.ssub.with.overflow.i32
-; CHECK: @llvm.ssub.with.overflow.i32
+; CHECK-NOT: @llvm.ssub.with.overflow.i32
 ; CHECK: @llvm.sadd.with.overflow.i32
 entry:
   %cmp = icmp sgt i32 %y, 0
@@ -61,7 +60,7 @@ cond.end: ; preds = %cond.false, %cont,
 
 define i32 @unsigned_add(i32 %x, i32 %y) {
 ; CHECK-LABEL: @unsigned_add(
-; CHECK: @llvm.usub.with.overflow.i32
+; CHECK-NOT: @llvm.usub.with.overflow.i32
 ; CHECK: @llvm.uadd.with.overflow.i32
 entry:
   %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 -1, i32 %y)
@@ -203,7 +202,7 @@ cond.end: ; preds = %cond.false, %entry
 
 define i32 @signed_sub_r1(i32 %x) {
 ; CHECK-LABEL: @signed_sub_r1(
-; CHECK: @llvm.ssub.with.overflow.i32
+; CHECK-NOT: @llvm.ssub.with.overflow.i32
 entry:
   %cmp = icmp eq i32 %x, -2147483648
   br i1 %cmp, label %cond.end, label %cond.false
@@ -225,7 +224,7 @@ cond.end: ; preds = %cond.false, %entry
 
 define i32 @unsigned_sub_r1(i32 %x) {
 ; CHECK-LABEL: @unsigned_sub_r1(
-; CHECK: @llvm.usub.with.overflow.i32
+; CHECK-NOT: @llvm.usub.with.overflow.i32
 entry:
   %cmp = icmp eq i32 %x, 0
   br i1 %cmp, label %cond.end, label %cond.false
@@ -269,7 +268,7 @@ cond.end: ; preds = %cond.false, %entry
 
 define i32 @signed_sub_rn1(i32 %x) {
 ; CHECK-LABEL: @signed_sub_rn1(
-; CHECK: @llvm.ssub.with.overflow.i32
+; CHECK-NOT: @llvm.ssub.with.overflow.i32
 entry:
   %cmp = icmp eq i32 %x, 2147483647
   br i1 %cmp, label %cond.end, label %cond.false
@@ -293,7 +292,7 @@ declare i32 @bar(i32)
 
 define void @unsigned_loop(i32 %i) {
 ; CHECK-LABEL: @unsigned_loop(
-; CHECK: @llvm.usub.with.overflow.i32
+; CHECK-NOT: @llvm.usub.with.overflow.i32
 entry:
   %cmp3 = icmp eq i32 %i, 0
   br i1 %cmp3, label %while.end, label %while.body.preheader
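The flipped CHECK-NOT lines capture the new expectation: CorrelatedValuePropagation can now delete a usub/ssub.with.overflow intrinsic outright when the operand range proves the subtraction cannot wrap, which relies on the Sub support added to ConstantRange::makeGuaranteedNoWrapRegion further down in this patch. A hedged sketch of that range test (the helper name and the surrounding simplification are assumptions, not the pass's actual code):

```cpp
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// True when "X - C" cannot wrap unsigned for any X in LHSRange; a pass could
// then fold @llvm.usub.with.overflow to a plain sub with a false overflow bit.
static bool usubCannotOverflow(const ConstantRange &LHSRange, const APInt &C) {
  ConstantRange NoWrap = ConstantRange::makeGuaranteedNoWrapRegion(
      Instruction::Sub, ConstantRange(C),
      OverflowingBinaryOperator::NoUnsignedWrap);
  return NoWrap.contains(LHSRange);
}
```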
diff --git a/test/Transforms/InstCombine/load-bitcast-select.ll b/test/Transforms/InstCombine/load-bitcast-select.ll
index 7e5652878acaf..945f3f7fbfe95 100644
--- a/test/Transforms/InstCombine/load-bitcast-select.ll
+++ b/test/Transforms/InstCombine/load-bitcast-select.ll
@@ -85,3 +85,26 @@ define void @bitcasted_store(i1 %cond, float* %loadaddr1, float* %loadaddr2, flo
   store i32 %ld, i32* %int_store_addr
   ret void
 }
+
+define void @bitcasted_minmax_with_select_of_pointers(float* %loadaddr1, float* %loadaddr2, float* %storeaddr) {
+; CHECK-LABEL: @bitcasted_minmax_with_select_of_pointers(
+; CHECK-NEXT:    [[LD1:%.*]] = load float, float* [[LOADADDR1:%.*]], align 4
+; CHECK-NEXT:    [[LD2:%.*]] = load float, float* [[LOADADDR2:%.*]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = fcmp ogt float [[LD1]], [[LD2]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], float* [[LOADADDR1]], float* [[LOADADDR2]]
+; CHECK-NEXT:    [[INT_LOAD_ADDR:%.*]] = bitcast float* [[SEL]] to i32*
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[INT_LOAD_ADDR]], align 4
+; CHECK-NEXT:    [[INT_STORE_ADDR:%.*]] = bitcast float* [[STOREADDR:%.*]] to i32*
+; CHECK-NEXT:    store i32 [[LD]], i32* [[INT_STORE_ADDR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ld1 = load float, float* %loadaddr1, align 4
+  %ld2 = load float, float* %loadaddr2, align 4
+  %cond = fcmp ogt float %ld1, %ld2
+  %sel = select i1 %cond, float* %loadaddr1, float* %loadaddr2
+  %int_load_addr = bitcast float* %sel to i32*
+  %ld = load i32, i32* %int_load_addr, align 4
+  %int_store_addr = bitcast float* %storeaddr to i32*
+  store i32 %ld, i32* %int_store_addr, align 4
+  ret void
+}
diff --git a/test/Transforms/InstCombine/out-of-bounds-indexes.ll b/test/Transforms/InstCombine/out-of-bounds-indexes.ll
new file mode 100644
index 0000000000000..a1887d27550ba
--- /dev/null
+++ b/test/Transforms/InstCombine/out-of-bounds-indexes.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; Check that we don't crash on unreasonable constant indexes
+
+define i32 @test_out_of_bounds(i32 %a, i1 %x, i1 %y) {
+; CHECK-LABEL: @test_out_of_bounds(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[A:%.*]], 3
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
+; CHECK-NEXT:    ret i32 [[AND1]]
+;
+entry:
+  %and1 = and i32 %a, 3
+  %B = lshr i32 %and1, -2147483648
+  %cmp = icmp eq i32 %B, 1
+  tail call void @llvm.assume(i1 %cmp)
+  ret i32 %and1
+}
+
+define i128 @test_non64bit(i128 %a) {
+; CHECK-LABEL: @test_non64bit(
+; CHECK-NEXT:    [[AND1:%.*]] = and i128 [[A:%.*]], 3
+; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
+; CHECK-NEXT:    ret i128 [[AND1]]
+;
+  %and1 = and i128 %a, 3
+  %B = lshr i128 %and1, -1
+  %cmp = icmp eq i128 %B, 1
+  tail call void @llvm.assume(i1 %cmp)
+  ret i128 %and1
+}
+
+declare void @llvm.assume(i1)
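out-of-bounds-indexes.ll pins down a crash on shift amounts that are not even representable as valid indexes: an lshr by an amount greater than or equal to the bit width is undefined, so every fold has to bail out first. The guard is essentially this check (assumed shape, not the exact InstCombine code):

```cpp
#include "llvm/ADT/APInt.h"

// "lshr i32 %and1, -2147483648" above carries a shift amount of 2^31, far
// beyond the 32-bit width, so a fold that indexes by it must reject it.
static bool isValidShiftAmount(const llvm::APInt &ShAmt, unsigned BitWidth) {
  return ShAmt.ult(BitWidth);
}
```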
diff --git a/test/Transforms/SimplifyCFG/switch_undef.ll b/test/Transforms/SimplifyCFG/switch_undef.ll
new file mode 100644
index 0000000000000..22b8bd389783d
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch_undef.ll
@@ -0,0 +1,23 @@
+; RUN: opt %s -keep-loops=false -switch-to-lookup=true -simplifycfg -S | FileCheck %s
+
+define void @f6() #0 {
+; CHECK-LABEL: entry:
+
+entry:
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %f1.exit.i, %entry
+  switch i16 undef, label %f1.exit.i [
+    i16 -1, label %cond.false.i3.i
+    i16 1, label %cond.false.i3.i
+    i16 0, label %cond.false.i3.i
+  ]
+
+cond.false.i3.i:                                  ; preds = %for.cond.i, %for.cond.i, %for.cond.i
+  br label %f1.exit.i
+
+f1.exit.i:                                        ; preds = %cond.false.i3.i, %for.cond.i
+  %cond.i4.i = phi i16 [ undef, %cond.false.i3.i ], [ 1, %for.cond.i ]
+  %tobool7.i = icmp ne i16 %cond.i4.i, 0
+  br label %for.cond.i
+}
diff --git a/test/tools/llvm-objcopy/cannot-delete-dest.test b/test/tools/llvm-objcopy/cannot-delete-dest.test
new file mode 100644
index 0000000000000..1853049c8acea
--- /dev/null
+++ b/test/tools/llvm-objcopy/cannot-delete-dest.test
@@ -0,0 +1,18 @@
+# REQUIRES: system-windows
+# RUN: yaml2obj %s > %t.o
+# RUN: rm -f %t2.o
+# RUN: cp %t.o %t2.o
+# RUN: attrib +r %t2.o
+
+# This fails because it cannot replace %t2.o
+# RUN: not llvm-objcopy %t.o %t2.o
+
+# But it doesn't leave any temporary files behind.
+# RUN: not ls %t2.o.tmp*
+
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
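The RUN lines above rely on llvm-objcopy writing its output through a temporary file: the result is staged next to the destination and renamed over it, and a failed rename must not leave the staging file behind (hence `not ls %t2.o.tmp*`). A rough portable sketch of that pattern (hypothetical helper, not llvm-objcopy's actual code):

```cpp
#include <cstdio>
#include <string>

// Write Data to Dest via a temporary; on any failure remove the temporary so
// no "<dest>.tmp*" files survive. Real code would use a unique suffix.
static bool writeAtomically(const std::string &Dest, const void *Data,
                            std::size_t Size) {
  std::string Tmp = Dest + ".tmp";
  FILE *F = std::fopen(Tmp.c_str(), "wb");
  if (!F)
    return false;
  bool Ok = std::fwrite(Data, 1, Size, F) == Size;
  Ok = (std::fclose(F) == 0) && Ok;
  if (Ok && std::rename(Tmp.c_str(), Dest.c_str()) == 0)
    return true;
  std::remove(Tmp.c_str()); // never leave the temporary behind
  return false;
}
```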
diff --git a/tools/bugpoint/CMakeLists.txt b/tools/bugpoint/CMakeLists.txt
index 8975e67634340..72c597379c8b1 100644
--- a/tools/bugpoint/CMakeLists.txt
+++ b/tools/bugpoint/CMakeLists.txt
@@ -37,7 +37,7 @@ add_llvm_tool(bugpoint
 export_executable_symbols(bugpoint)
 
 if(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
-  target_link_libraries(bugpoint Polly)
+  target_link_libraries(bugpoint PRIVATE Polly)
   # Ensure LLVMTarget can resolve dependences in Polly.
-  target_link_libraries(bugpoint LLVMTarget)
+  target_link_libraries(bugpoint PRIVATE LLVMTarget)
 endif(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
diff --git a/tools/dsymutil/CMakeLists.txt b/tools/dsymutil/CMakeLists.txt
index 61d78b5094a70..1dcb2116f34b5 100644
--- a/tools/dsymutil/CMakeLists.txt
+++ b/tools/dsymutil/CMakeLists.txt
@@ -22,5 +22,5 @@ add_llvm_tool(llvm-dsymutil
 )
 
 IF(APPLE)
-  target_link_libraries(llvm-dsymutil "-framework CoreFoundation")
+  target_link_libraries(llvm-dsymutil PRIVATE "-framework CoreFoundation")
 ENDIF(APPLE)
diff --git a/tools/llvm-cfi-verify/CMakeLists.txt b/tools/llvm-cfi-verify/CMakeLists.txt
index de6a46e785955..7a008a66770c8 100644
--- a/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/tools/llvm-cfi-verify/CMakeLists.txt
@@ -15,4 +15,4 @@ add_llvm_tool(llvm-cfi-verify llvm-cfi-verify.cpp)
 
 add_subdirectory(lib)
 
-target_link_libraries(llvm-cfi-verify LLVMCFIVerify)
+target_link_libraries(llvm-cfi-verify PRIVATE LLVMCFIVerify)
diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt
index 043a181d6392e..177c98166ef1f 100644
--- a/tools/llvm-objdump/CMakeLists.txt
+++ b/tools/llvm-objdump/CMakeLists.txt
@@ -23,7 +23,7 @@ add_llvm_tool(llvm-objdump
 )
 
 if(HAVE_LIBXAR)
-  target_link_libraries(llvm-objdump ${XAR_LIB})
+  target_link_libraries(llvm-objdump PRIVATE ${XAR_LIB})
 endif()
 
 if(LLVM_INSTALL_BINUTILS_SYMLINKS)
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 1f763b93dc28e..4412d6833411f 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -9636,3 +9636,4 @@ static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue,
   auto name = info->bindtable->lookup(ReferenceValue);
   return !name.empty() ? name.data() : nullptr;
 }
+
diff --git a/tools/opt/CMakeLists.txt b/tools/opt/CMakeLists.txt
index 518396e36028f..fcc957abaee56 100644
--- a/tools/opt/CMakeLists.txt
+++ b/tools/opt/CMakeLists.txt
@@ -37,5 +37,5 @@ add_llvm_tool(opt
 export_executable_symbols(opt)
 
 if(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
-  target_link_libraries(opt Polly)
+  target_link_libraries(opt PRIVATE Polly)
 endif(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
diff --git a/unittests/DebugInfo/CodeView/CMakeLists.txt b/unittests/DebugInfo/CodeView/CMakeLists.txt
index 6f504d8149b50..d06ccfaba72a9 100644
--- a/unittests/DebugInfo/CodeView/CMakeLists.txt
+++ b/unittests/DebugInfo/CodeView/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
 
 set(DebugInfoCodeViewSources
   RandomAccessVisitorTest.cpp
+  TypeHashingTest.cpp
  TypeIndexDiscoveryTest.cpp
   )
 
@@ -11,4 +12,4 @@ add_llvm_unittest(DebugInfoCodeViewTests
   ${DebugInfoCodeViewSources}
   )
 
-target_link_libraries(DebugInfoCodeViewTests LLVMTestingSupport)
\ No newline at end of file
+target_link_libraries(DebugInfoCodeViewTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/DebugInfo/CodeView/TypeHashingTest.cpp b/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
new file mode 100644
index 0000000000000..5b9dadfb33ff0
--- /dev/null
+++ b/unittests/DebugInfo/CodeView/TypeHashingTest.cpp
@@ -0,0 +1,156 @@
+//===- llvm/unittest/DebugInfo/CodeView/TypeHashingTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeHashing.h"
+#include "llvm/DebugInfo/CodeView/AppendingTypeTableBuilder.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static TypeIndex createPointerRecord(AppendingTypeTableBuilder &Builder,
+                                     TypeIndex TI) {
+  PointerRecord PR(TypeRecordKind::Pointer);
+  PR.setAttrs(PointerKind::Near32, PointerMode::Pointer, PointerOptions::None,
+              4);
+  PR.ReferentType = TI;
+  return Builder.writeLeafType(PR);
+}
+
+static TypeIndex createArgListRecord(AppendingTypeTableBuilder &Builder,
+                                     TypeIndex Q, TypeIndex R) {
+  ArgListRecord AR(TypeRecordKind::ArgList);
+  AR.ArgIndices.push_back(Q);
+  AR.ArgIndices.push_back(R);
+  return Builder.writeLeafType(AR);
+}
+
+static TypeIndex createProcedureRecord(AppendingTypeTableBuilder &Builder,
+                                       uint32_t ParamCount, TypeIndex Return,
+                                       TypeIndex ArgList) {
+  ProcedureRecord PR(TypeRecordKind::Procedure);
+  PR.ArgumentList = ArgList;
+  PR.CallConv = CallingConvention::NearC;
+  PR.Options = FunctionOptions::None;
+  PR.ParameterCount = ParamCount;
+  PR.ReturnType = Return;
+  return Builder.writeLeafType(PR);
+}
+
+static ArrayRef<uint8_t> hash_of(ArrayRef<GloballyHashedType> Hashes,
+                                 TypeIndex TI) {
+  return Hashes[TI.toArrayIndex()].Hash;
+}
+
+static void verifyHashUniqueness(ArrayRef<GloballyHashedType> Hashes) {
+  assert(!Hashes.empty());
+
+  for (size_t I = 0; I < Hashes.size() - 1; ++I) {
+    for (size_t J = I + 1; J < Hashes.size(); ++J) {
+      EXPECT_NE(Hashes[I].Hash, Hashes[J].Hash);
+    }
+  }
+}
+
+TEST(TypeHashingTest, ContentHash) {
+  SimpleTypeSerializer Serializer;
+
+  TypeIndex CharStar(SimpleTypeKind::SignedCharacter,
+                     SimpleTypeMode::NearPointer32);
+
+  BumpPtrAllocator Alloc;
+  AppendingTypeTableBuilder Ordering1(Alloc);
+  AppendingTypeTableBuilder Ordering2(Alloc);
+
+  TypeIndex CharP(SimpleTypeKind::SignedCharacter, SimpleTypeMode::NearPointer);
+  TypeIndex IntP(SimpleTypeKind::Int32, SimpleTypeMode::NearPointer);
+  TypeIndex DoubleP(SimpleTypeKind::Float64, SimpleTypeMode::NearPointer);
+
+  // We're going to build the same type sequence with two different orderings,
+  // and then confirm all records are hashed the same.
+
+  TypeIndex CharPP[2];
+  TypeIndex IntPP[2];
+  TypeIndex IntPPP[2];
+  TypeIndex DoublePP[2];
+  TypeIndex Args[2];
+  TypeIndex Proc[2];
+
+  // Ordering 1
+  // ----------------------------------------
+  // LF_POINTER 0x1000 {char**}
+  //   Referent = char*
+  // LF_POINTER 0x1001 {int**}
+  //   Referent = int*
+  // LF_POINTER 0x1002 {int***}
+  //   Referent = 0x1001
+  // LF_ARGLIST 0x1003 {(char**, int***)}
+  //   Arg[0] = 0x1000
+  //   Arg[1] = 0x1002
+  // LF_PROCEDURE 0x1004 {int** func(char**, int***)}
+  //   ArgList = 0x1003
+  //   ReturnType = 0x1001
+  std::vector<GloballyHashedType> Ordering1Hashes;
+  CharPP[0] = createPointerRecord(Ordering1, CharP);
+  IntPP[0] = createPointerRecord(Ordering1, IntP);
+  IntPPP[0] = createPointerRecord(Ordering1, IntPP[0]);
+  Args[0] = createArgListRecord(Ordering1, CharPP[0], IntPPP[0]);
+  Proc[0] = createProcedureRecord(Ordering1, 2, IntPP[0], Args[0]);
+
+  ASSERT_EQ(0x1000U, CharPP[0].getIndex());
+  ASSERT_EQ(0x1001U, IntPP[0].getIndex());
+  ASSERT_EQ(0x1002U, IntPPP[0].getIndex());
+  ASSERT_EQ(0x1003U, Args[0].getIndex());
+  ASSERT_EQ(0x1004U, Proc[0].getIndex());
+
+  auto Hashes1 = GloballyHashedType::hashTypes(Ordering1.records());
+
+  // Ordering 2
+  // ----------------------------------------
+  // LF_POINTER 0x1000 {int**}
+  //   Referent = int*
+  // LF_POINTER 0x1001 {int***}
+  //   Referent = 0x1000
+  // LF_POINTER 0x1002 {char**}
+  //   Referent = char*
+  // LF_POINTER 0x1003 {double**}
+  //   Referent = double*
+  // LF_ARGLIST 0x1004 {(char**, int***)}
+  //   Arg[0] = 0x1002
+  //   Arg[1] = 0x1001
+  // LF_PROCEDURE 0x1005 {int** func(char**, int***)}
+  //   ArgList = 0x1004
+  //   ReturnType = 0x1000
+  IntPP[1] = createPointerRecord(Ordering2, IntP);
+  IntPPP[1] = createPointerRecord(Ordering2, IntPP[1]);
+  CharPP[1] = createPointerRecord(Ordering2, CharP);
+  DoublePP[1] = createPointerRecord(Ordering2, DoubleP);
+  Args[1] = createArgListRecord(Ordering2, CharPP[1], IntPPP[1]);
+  Proc[1] = createProcedureRecord(Ordering2, 2, IntPP[1], Args[1]);
+  auto Hashes2 = GloballyHashedType::hashTypes(Ordering2.records());
+
+  ASSERT_EQ(0x1000U, IntPP[1].getIndex());
+  ASSERT_EQ(0x1001U, IntPPP[1].getIndex());
+  ASSERT_EQ(0x1002U, CharPP[1].getIndex());
+  ASSERT_EQ(0x1003U, DoublePP[1].getIndex());
+  ASSERT_EQ(0x1004U, Args[1].getIndex());
+  ASSERT_EQ(0x1005U, Proc[1].getIndex());
+
+  // Sanity check to make sure all same-ordering hashes are different
+  // from each other.
+  verifyHashUniqueness(Hashes1);
+  verifyHashUniqueness(Hashes2);
+
+  EXPECT_EQ(hash_of(Hashes1, IntPP[0]), hash_of(Hashes2, IntPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, IntPPP[0]), hash_of(Hashes2, IntPPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, CharPP[0]), hash_of(Hashes2, CharPP[1]));
+  EXPECT_EQ(hash_of(Hashes1, Args[0]), hash_of(Hashes2, Args[1]));
+  EXPECT_EQ(hash_of(Hashes1, Proc[0]), hash_of(Hashes2, Proc[1]));
+}
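The two orderings hash identically because a record's global hash is computed from its content with every TypeIndex operand replaced by the global hash of the record it refers to, so module-local numbering (0x1000 vs. 0x1002) drops out of the digest. A rough illustration of the idea (the real GloballyHashedType uses a SHA1-based digest; hashRecord is a hypothetical helper):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include <cstdint>

// Content hash of one record: the raw bytes minus the TypeIndex slots,
// combined with the previously computed hashes of the referenced records.
static llvm::hash_code
hashRecord(llvm::ArrayRef<uint8_t> BytesWithoutIndices,
           llvm::ArrayRef<llvm::hash_code> ReferentHashes) {
  return llvm::hash_combine(
      llvm::hash_combine_range(BytesWithoutIndices.begin(),
                               BytesWithoutIndices.end()),
      llvm::hash_combine_range(ReferentHashes.begin(), ReferentHashes.end()));
}
```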
diff --git a/unittests/DebugInfo/DWARF/CMakeLists.txt b/unittests/DebugInfo/DWARF/CMakeLists.txt
index 1966472a9467d..f490097a21a75 100644
--- a/unittests/DebugInfo/DWARF/CMakeLists.txt
+++ b/unittests/DebugInfo/DWARF/CMakeLists.txt
@@ -18,4 +18,4 @@ add_llvm_unittest(DebugInfoDWARFTests
   ${DebugInfoSources}
   )
 
-target_link_libraries(DebugInfoDWARFTests LLVMTestingSupport)
+target_link_libraries(DebugInfoDWARFTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/DebugInfo/MSF/CMakeLists.txt b/unittests/DebugInfo/MSF/CMakeLists.txt
index 25e011178cddb..20f3b2ab3dcdd 100644
--- a/unittests/DebugInfo/MSF/CMakeLists.txt
+++ b/unittests/DebugInfo/MSF/CMakeLists.txt
@@ -12,4 +12,4 @@ add_llvm_unittest(DebugInfoMSFTests
   ${DebugInfoMSFSources}
   )
 
-target_link_libraries(DebugInfoMSFTests LLVMTestingSupport)
+target_link_libraries(DebugInfoMSFTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt
index e2db58ff93754..b19ee2cf43a51 100644
--- a/unittests/DebugInfo/PDB/CMakeLists.txt
+++ b/unittests/DebugInfo/PDB/CMakeLists.txt
@@ -14,4 +14,4 @@ add_llvm_unittest(DebugInfoPDBTests
   ${DebugInfoPDBSources}
   )
 
-target_link_libraries(DebugInfoPDBTests LLVMTestingSupport)
+target_link_libraries(DebugInfoPDBTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
index e7e3034905ecd..28e07959ac7bc 100644
--- a/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_unittest(OrcJITTests
   RemoteObjectLayerTest.cpp
   RPCUtilsTest.cpp
   RTDyldObjectLinkingLayerTest.cpp
+  SymbolStringPoolTest.cpp
   )
 
-target_link_libraries(OrcJITTests ${LLVM_PTHREAD_LIB})
+target_link_libraries(OrcJITTests PRIVATE ${LLVM_PTHREAD_LIB})
diff --git a/unittests/ExecutionEngine/Orc/SymbolStringPoolTest.cpp b/unittests/ExecutionEngine/Orc/SymbolStringPoolTest.cpp
new file mode 100644
index 0000000000000..ac79541d50c22
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/SymbolStringPoolTest.cpp
@@ -0,0 +1,43 @@
+//===----- SymbolStringPoolTest.cpp - Unit tests for SymbolStringPool -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+TEST(SymbolStringPool, UniquingAndEquality) {
+  SymbolStringPool SP;
+  auto P1 = SP.intern("hello");
+
+  std::string S("hel");
+  S += "lo";
+  auto P2 = SP.intern(S);
+
+  auto P3 = SP.intern("goodbye");
+
+  EXPECT_EQ(P1, P2) << "Failed to unique entries";
+  EXPECT_NE(P1, P3) << "Unequal pooled symbol strings comparing equal";
+}
+
+TEST(SymbolStringPool, ClearDeadEntries) {
+  SymbolStringPool SP;
+  {
+    auto P1 = SP.intern("s1");
+    SP.clearDeadEntries();
+    EXPECT_FALSE(SP.empty()) << "\"s1\" entry in pool should still be retained";
+  }
+  SP.clearDeadEntries();
+  EXPECT_TRUE(SP.empty()) << "pool should be empty";
+}
+
+}
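The contract exercised above is uniquing (equal strings intern to the same entry, compared by identity) plus reference counting (clearDeadEntries only reclaims entries with no live handles). A toy model of that contract with assumed member names, not Orc's real implementation:

```cpp
#include <iterator>
#include <map>
#include <memory>
#include <string>

// Toy pool: intern() hands out shared handles; an entry whose only remaining
// owner is the pool itself is "dead" and reclaimed by clearDeadEntries().
class ToySymbolStringPool {
  std::map<std::string, std::shared_ptr<const std::string>> Entries;

public:
  std::shared_ptr<const std::string> intern(std::string S) {
    auto &Slot = Entries[S];
    if (!Slot)
      Slot = std::make_shared<const std::string>(std::move(S));
    return Slot; // equal strings yield the same pointer, as in P1 == P2
  }

  void clearDeadEntries() {
    for (auto I = Entries.begin(), E = Entries.end(); I != E;)
      I = (I->second.use_count() == 1) ? Entries.erase(I) : std::next(I);
  }

  bool empty() const { return Entries.empty(); }
};
```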
diff --git a/unittests/IR/ConstantRangeTest.cpp b/unittests/IR/ConstantRangeTest.cpp
index 0292f60fe3322..b7e6235cec324 100644
--- a/unittests/IR/ConstantRangeTest.cpp
+++ b/unittests/IR/ConstantRangeTest.cpp
@@ -715,24 +715,102 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) {
     }
   }
 
+  for (int Const : {0, -1, -2, 1, 2, IntMin4Bits, IntMax4Bits}) {
+    APInt C(4, Const, true /* = isSigned */);
+
+    auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+        Instruction::Sub, C, OBO::NoUnsignedWrap);
+
+    EXPECT_FALSE(NUWRegion.isEmptySet());
+
+    auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+        Instruction::Sub, C, OBO::NoSignedWrap);
+
+    EXPECT_FALSE(NSWRegion.isEmptySet());
+
+    auto NoWrapRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+        Instruction::Sub, C, OBO::NoSignedWrap | OBO::NoUnsignedWrap);
+
+    EXPECT_FALSE(NoWrapRegion.isEmptySet());
+    EXPECT_TRUE(NUWRegion.intersectWith(NSWRegion).contains(NoWrapRegion));
+
+    for (APInt I = NUWRegion.getLower(), E = NUWRegion.getUpper(); I != E;
+         ++I) {
+      bool Overflow = false;
+      (void)I.usub_ov(C, Overflow);
+      EXPECT_FALSE(Overflow);
+    }
+
+    for (APInt I = NSWRegion.getLower(), E = NSWRegion.getUpper(); I != E;
+         ++I) {
+      bool Overflow = false;
+      (void)I.ssub_ov(C, Overflow);
+      EXPECT_FALSE(Overflow);
+    }
+
+    for (APInt I = NoWrapRegion.getLower(), E = NoWrapRegion.getUpper(); I != E;
+         ++I) {
+      bool Overflow = false;
+
+      (void)I.ssub_ov(C, Overflow);
+      EXPECT_FALSE(Overflow);
+
+      (void)I.usub_ov(C, Overflow);
+      EXPECT_FALSE(Overflow);
+    }
+  }
+
   auto NSWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
       Instruction::Add, ConstantRange(32, /* isFullSet = */ true),
       OBO::NoSignedWrap);
   EXPECT_TRUE(NSWForAllValues.isSingleElement() &&
               NSWForAllValues.getSingleElement()->isMinValue());
 
+  NSWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, ConstantRange(32, /* isFullSet = */ true),
+      OBO::NoSignedWrap);
+  EXPECT_TRUE(NSWForAllValues.isSingleElement() &&
+              NSWForAllValues.getSingleElement()->isMaxValue());
+
   auto NUWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
       Instruction::Add, ConstantRange(32, /* isFullSet = */ true),
       OBO::NoUnsignedWrap);
   EXPECT_TRUE(NUWForAllValues.isSingleElement() &&
               NUWForAllValues.getSingleElement()->isMinValue());
 
+  NUWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, ConstantRange(32, /* isFullSet = */ true),
+      OBO::NoUnsignedWrap);
+  EXPECT_TRUE(NUWForAllValues.isSingleElement() &&
+              NUWForAllValues.getSingleElement()->isMaxValue());
+
   auto NUWAndNSWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
       Instruction::Add, ConstantRange(32, /* isFullSet = */ true),
       OBO::NoUnsignedWrap | OBO::NoSignedWrap);
   EXPECT_TRUE(NUWAndNSWForAllValues.isSingleElement() &&
               NUWAndNSWForAllValues.getSingleElement()->isMinValue());
 
+  NUWAndNSWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, ConstantRange(32, /* isFullSet = */ true),
+      OBO::NoUnsignedWrap | OBO::NoSignedWrap);
+  EXPECT_TRUE(NUWAndNSWForAllValues.isSingleElement() &&
+              NUWAndNSWForAllValues.getSingleElement()->isMaxValue());
+
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Add, APInt(32, 0), OBO::NoUnsignedWrap).isFullSet());
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Add, APInt(32, 0), OBO::NoSignedWrap).isFullSet());
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Add, APInt(32, 0),
+      OBO::NoUnsignedWrap | OBO::NoSignedWrap).isFullSet());
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, APInt(32, 0), OBO::NoUnsignedWrap).isFullSet());
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, APInt(32, 0), OBO::NoSignedWrap).isFullSet());
+  EXPECT_TRUE(ConstantRange::makeGuaranteedNoWrapRegion(
+      Instruction::Sub, APInt(32, 0),
+      OBO::NoUnsignedWrap | OBO::NoSignedWrap).isFullSet());
+
   ConstantRange OneToFive(APInt(32, 1), APInt(32, 6));
   EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
                 Instruction::Add, OneToFive, OBO::NoSignedWrap),
@@ -745,6 +823,17 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) {
             ConstantRange::makeGuaranteedNoWrapRegion(
                 Instruction::Add, OneToFive, OBO::NoUnsignedWrap | OBO::NoSignedWrap),
             ConstantRange(APInt::getMinValue(32), APInt::getSignedMaxValue(32) - 4));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, OneToFive, OBO::NoSignedWrap),
+            ConstantRange(APInt::getSignedMinValue(32) + 5,
+                          APInt::getSignedMinValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, OneToFive, OBO::NoUnsignedWrap),
+            ConstantRange(APInt::getMinValue(32) + 5, APInt::getMinValue(32)));
+  EXPECT_EQ(
+      ConstantRange::makeGuaranteedNoWrapRegion(
+          Instruction::Sub, OneToFive, OBO::NoUnsignedWrap | OBO::NoSignedWrap),
+      ConstantRange(APInt::getMinValue(32) + 5, APInt::getSignedMinValue(32)));
 
   ConstantRange MinusFiveToMinusTwo(APInt(32, -5), APInt(32, -1));
   EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
@@ -758,6 +847,19 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) {
                 Instruction::Add, MinusFiveToMinusTwo,
                 OBO::NoUnsignedWrap | OBO::NoSignedWrap),
             ConstantRange(APInt(32, 0), APInt(32, 2)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusFiveToMinusTwo, OBO::NoSignedWrap),
+            ConstantRange(APInt::getSignedMinValue(32),
+                          APInt::getSignedMaxValue(32) - 4));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusFiveToMinusTwo, OBO::NoUnsignedWrap),
+            ConstantRange(APInt::getMaxValue(32) - 1,
+                          APInt::getMinValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusFiveToMinusTwo,
+                OBO::NoUnsignedWrap | OBO::NoSignedWrap),
+            ConstantRange(APInt::getMaxValue(32) - 1,
+                          APInt::getMinValue(32)));
 
   ConstantRange MinusOneToOne(APInt(32, -1), APInt(32, 2));
   EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
@@ -771,6 +873,43 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) {
                 Instruction::Add, MinusOneToOne,
                 OBO::NoUnsignedWrap | OBO::NoSignedWrap),
             ConstantRange(APInt(32, 0), APInt(32, 1)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusOneToOne, OBO::NoSignedWrap),
+            ConstantRange(APInt::getSignedMinValue(32) + 1,
+                          APInt::getSignedMinValue(32) - 1));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusOneToOne, OBO::NoUnsignedWrap),
+            ConstantRange(APInt::getMaxValue(32),
+                          APInt::getMinValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, MinusOneToOne,
+                OBO::NoUnsignedWrap | OBO::NoSignedWrap),
+            ConstantRange(APInt::getMaxValue(32),
+                          APInt::getMinValue(32)));
+
+  ConstantRange One(APInt(32, 1), APInt(32, 2));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Add, One, OBO::NoSignedWrap),
+            ConstantRange(APInt::getSignedMinValue(32),
+                          APInt::getSignedMaxValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Add, One, OBO::NoUnsignedWrap),
+            ConstantRange(APInt::getMinValue(32), APInt::getMaxValue(32)));
+  EXPECT_EQ(
+      ConstantRange::makeGuaranteedNoWrapRegion(
+          Instruction::Add, One, OBO::NoUnsignedWrap | OBO::NoSignedWrap),
+      ConstantRange(APInt(32, 0), APInt::getSignedMaxValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, One, OBO::NoSignedWrap),
+            ConstantRange(APInt::getSignedMinValue(32) + 1,
+                          APInt::getSignedMinValue(32)));
+  EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
+                Instruction::Sub, One, OBO::NoUnsignedWrap),
+            ConstantRange(APInt::getMinValue(32) + 1, APInt::getMinValue(32)));
+  EXPECT_EQ(
+      ConstantRange::makeGuaranteedNoWrapRegion(
+          Instruction::Sub, One, OBO::NoUnsignedWrap | OBO::NoSignedWrap),
+      ConstantRange(APInt::getMinValue(32) + 1, APInt::getSignedMinValue(32)));
 }
 
 TEST(ConstantRange, GetEquivalentICmp) {
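One of the new Sub expectations, worked by hand from the documented semantics (the region holds every X for which X op Y cannot wrap, for all Y in the given range): with Y ranging over [1, 6) = {1..5}, X - Y stays unsigned-in-range exactly when X >= 5, giving the wrapped region [5, 0) that the EXPECT_EQ above demands. As a standalone check:

```cpp
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include <cassert>

using namespace llvm;

void workedSubNUWExample() {
  ConstantRange OneToFive(APInt(32, 1), APInt(32, 6)); // {1, 2, 3, 4, 5}
  ConstantRange R = ConstantRange::makeGuaranteedNoWrapRegion(
      Instruction::Sub, OneToFive, OverflowingBinaryOperator::NoUnsignedWrap);
  // [5, 0) wraps around zero: it is {5, 6, ..., UINT32_MAX}, i.e. all X >= 5.
  assert(R.getLower() == 5 && R.getUpper() == 0);
}
```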
diff --git a/unittests/ProfileData/CMakeLists.txt b/unittests/ProfileData/CMakeLists.txt
index 80f9ada7b83a3..366ed5482bf2c 100644
--- a/unittests/ProfileData/CMakeLists.txt
+++ b/unittests/ProfileData/CMakeLists.txt
@@ -11,4 +11,4 @@ add_llvm_unittest(ProfileDataTests
   SampleProfTest.cpp
   )
 
-target_link_libraries(ProfileDataTests LLVMTestingSupport)
+target_link_libraries(ProfileDataTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index f2a9b472d9071..24500e66ae741 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -74,6 +74,6 @@ add_llvm_unittest(SupportTests
 set_source_files_properties(AlignOfTest.cpp PROPERTIES COMPILE_FLAGS -w)
 
 # ManagedStatic.cpp uses <pthread>.
-target_link_libraries(SupportTests LLVMTestingSupport ${LLVM_PTHREAD_LIB})
+target_link_libraries(SupportTests PRIVATE LLVMTestingSupport ${LLVM_PTHREAD_LIB})
 
 add_subdirectory(DynamicLibrary)
diff --git a/unittests/Support/DynamicLibrary/CMakeLists.txt b/unittests/Support/DynamicLibrary/CMakeLists.txt
index c6201b1ad319d..4f060e4020d12 100644
--- a/unittests/Support/DynamicLibrary/CMakeLists.txt
+++ b/unittests/Support/DynamicLibrary/CMakeLists.txt
@@ -4,7 +4,7 @@ add_library(DynamicLibraryLib STATIC ExportedFuncs.cxx)
 set_target_properties(DynamicLibraryLib PROPERTIES FOLDER "Tests")
 
 add_llvm_unittest(DynamicLibraryTests DynamicLibraryTest.cpp)
-target_link_libraries(DynamicLibraryTests DynamicLibraryLib)
+target_link_libraries(DynamicLibraryTests PRIVATE DynamicLibraryLib)
 export_executable_symbols(DynamicLibraryTests)
 
 function(dynlib_add_module NAME)
diff --git a/unittests/Support/TarWriterTest.cpp b/unittests/Support/TarWriterTest.cpp
index 6007e73ffafc2..901dd906ca78a 100644
--- a/unittests/Support/TarWriterTest.cpp
+++ b/unittests/Support/TarWriterTest.cpp
@@ -120,4 +120,60 @@ TEST_F(TarWriterTest, Pax) {
   StringRef Pax = StringRef((char *)(Buf.data() + 512), 512);
   EXPECT_TRUE(Pax.startswith("211 path=/" + std::string(200, 'x')));
 }
+
+TEST_F(TarWriterTest, SingleFile) {
+  SmallString<128> Path;
+  std::error_code EC =
+      sys::fs::createTemporaryFile("TarWriterTest", "tar", Path);
+  EXPECT_FALSE((bool)EC);
+
+  Expected<std::unique_ptr<TarWriter>> TarOrErr = TarWriter::create(Path, "");
+  EXPECT_TRUE((bool)TarOrErr);
+  std::unique_ptr<TarWriter> Tar = std::move(*TarOrErr);
+  Tar->append("FooPath", "foo");
+  Tar.reset();
+
+  uint64_t TarSize;
+  EC = sys::fs::file_size(Path, TarSize);
+  EXPECT_FALSE((bool)EC);
+  EXPECT_EQ(TarSize, 2048ULL);
+}
+
+TEST_F(TarWriterTest, NoDuplicate) {
+  SmallString<128> Path;
+  std::error_code EC =
+      sys::fs::createTemporaryFile("TarWriterTest", "tar", Path);
+  EXPECT_FALSE((bool)EC);
+
+  Expected<std::unique_ptr<TarWriter>> TarOrErr = TarWriter::create(Path, "");
+  EXPECT_TRUE((bool)TarOrErr);
+  std::unique_ptr<TarWriter> Tar = std::move(*TarOrErr);
+  Tar->append("FooPath", "foo");
+  Tar->append("BarPath", "bar");
+  Tar.reset();
+
+  uint64_t TarSize;
+  EC = sys::fs::file_size(Path, TarSize);
+  EXPECT_FALSE((bool)EC);
+  EXPECT_EQ(TarSize, 3072ULL);
+}
+
+TEST_F(TarWriterTest, Duplicate) {
+  SmallString<128> Path;
+  std::error_code EC =
+      sys::fs::createTemporaryFile("TarWriterTest", "tar", Path);
+  EXPECT_FALSE((bool)EC);
+
+  Expected<std::unique_ptr<TarWriter>> TarOrErr = TarWriter::create(Path, "");
+  EXPECT_TRUE((bool)TarOrErr);
+  std::unique_ptr<TarWriter> Tar = std::move(*TarOrErr);
+  Tar->append("FooPath", "foo");
+  Tar->append("FooPath", "bar");
+  Tar.reset();
+
+  uint64_t TarSize;
+  EC = sys::fs::file_size(Path, TarSize);
+  EXPECT_FALSE((bool)EC);
+  EXPECT_EQ(TarSize, 2048ULL);
 }
+} // namespace
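The size assertions follow directly from the classic tar layout, assuming TarWriter emits 512-byte blocks: each stored member costs one header block plus its content padded to a block, and the archive ends with two zero blocks; Duplicate comes back to 2048 because the second append under the same path is skipped. Back-of-the-envelope (hypothetical helper for the short "foo"/"bar" contents used here):

```cpp
#include <cstdint>

// N distinct small members, each fitting one content block:
// N * (512 header + 512 content) + 2 * 512 end-of-archive marker.
static std::uint64_t expectedTarSize(unsigned NumMembers) {
  const std::uint64_t Block = 512;
  return NumMembers * 2 * Block + 2 * Block;
}
// expectedTarSize(1) == 2048 (SingleFile, Duplicate)
// expectedTarSize(2) == 3072 (NoDuplicate)
```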
diff --git a/unittests/tools/llvm-cfi-verify/CMakeLists.txt b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
index adb7a55327ae8..e47bbdf7f1312 100644
--- a/unittests/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
@@ -14,4 +14,4 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(CFIVerifyTests FileAnalysis.cpp GraphBuilder.cpp)
 
-target_link_libraries(CFIVerifyTests LLVMCFIVerify)
+target_link_libraries(CFIVerifyTests PRIVATE LLVMCFIVerify)
diff --git a/utils/FileCheck/CMakeLists.txt b/utils/FileCheck/CMakeLists.txt
index 999320f78af28..32e948a1a19eb 100644
--- a/utils/FileCheck/CMakeLists.txt
+++ b/utils/FileCheck/CMakeLists.txt
@@ -2,4 +2,4 @@ add_llvm_utility(FileCheck
   FileCheck.cpp
   )
 
-target_link_libraries(FileCheck LLVMSupport)
+target_link_libraries(FileCheck PRIVATE LLVMSupport)
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index 288f78b5d0c27..062fe59d82032 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -2378,9 +2378,8 @@ class GlobalISelEmitter {
   CodeGenRegBank CGRegs;
 
   /// Keep track of the equivalence between SDNodes and Instruction by mapping
-  /// SDNodes to the GINodeEquiv mapping. We map to the GINodeEquiv in case we
-  /// need to check for attributes on the relation such as (the now removed)
-  /// CheckMMOIsNonAtomic.
+  /// SDNodes to the GINodeEquiv mapping. We need to map to the GINodeEquiv to
+  /// check for attributes on the relation such as CheckMMOIsNonAtomic.
   /// This is defined using 'GINodeEquiv' in the target description.
   DenseMap<Record *, Record *> NodeEquivs;
 
@@ -2399,8 +2398,6 @@ class GlobalISelEmitter {
   Record *findNodeEquiv(Record *N) const;
 
   Error importRulePredicates(RuleMatcher &M, ArrayRef<Predicate> Predicates);
-  Error importInstructionPredicates(InstructionMatcher &InsnMatcher,
-                                    const TreePatternNode *Src) const;
   Expected<InstructionMatcher &> createAndImportSelDAGMatcher(
       RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
       const TreePatternNode *Src, unsigned &TempOpIdx) const;
@@ -2486,8 +2483,45 @@ GlobalISelEmitter::importRulePredicates(RuleMatcher &M,
   return Error::success();
 }
 
-Error GlobalISelEmitter::importInstructionPredicates(
-    InstructionMatcher &InsnMatcher, const TreePatternNode *Src) const {
+Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
+    RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
+    const TreePatternNode *Src, unsigned &TempOpIdx) const {
+  Record *SrcGIEquivOrNull = nullptr;
+  const CodeGenInstruction *SrcGIOrNull = nullptr;
+
+  // Start with the defined operands (i.e., the results of the root operator).
+  if (Src->getExtTypes().size() > 1)
+    return failedImport("Src pattern has multiple results");
+
+  if (Src->isLeaf()) {
+    Init *SrcInit = Src->getLeafValue();
+    if (isa<IntInit>(SrcInit)) {
+      InsnMatcher.addPredicate<InstructionOpcodeMatcher>(
+          &Target.getInstruction(RK.getDef("G_CONSTANT")));
+    } else
+      return failedImport(
+          "Unable to deduce gMIR opcode to handle Src (which is a leaf)");
+  } else {
+    SrcGIEquivOrNull = findNodeEquiv(Src->getOperator());
+    if (!SrcGIEquivOrNull)
+      return failedImport("Pattern operator lacks an equivalent Instruction" +
+                          explainOperator(Src->getOperator()));
+    SrcGIOrNull = &Target.getInstruction(SrcGIEquivOrNull->getValueAsDef("I"));
+
+    // The operators look good: match the opcode
+    InsnMatcher.addPredicate<InstructionOpcodeMatcher>(SrcGIOrNull);
+  }
+
+  unsigned OpIdx = 0;
+  for (const TypeSetByHwMode &VTy : Src->getExtTypes()) {
+    // Results don't have a name unless they are the root node. The caller will
+    // set the name if appropriate.
+    OperandMatcher &OM = InsnMatcher.addOperand(OpIdx++, "", TempOpIdx);
+    if (auto Error = OM.addTypeCheckPredicate(VTy, false /* OperandIsAPointer */))
+      return failedImport(toString(std::move(Error)) +
+                          " for result of Src pattern operator");
+  }
+
   for (const auto &Predicate : Src->getPredicateFns()) {
     if (Predicate.isAlwaysTrue())
       continue;
@@ -2576,50 +2610,9 @@ Error GlobalISelEmitter::importInstructionPredicates(
     return failedImport("Src pattern child has predicate (" +
                         explainPredicates(Src) + ")");
   }
+  if (SrcGIEquivOrNull && SrcGIEquivOrNull->getValueAsBit("CheckMMOIsNonAtomic"))
+    InsnMatcher.addPredicate<AtomicOrderingMMOPredicateMatcher>("NotAtomic");
 
-  return Error::success();
-}
-
-Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
-    RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
-    const TreePatternNode *Src, unsigned &TempOpIdx) const {
-  Record *SrcGIEquivOrNull = nullptr;
-  const CodeGenInstruction *SrcGIOrNull = nullptr;
-
-  // Start with the defined operands (i.e., the results of the root operator).
-  if (Src->getExtTypes().size() > 1)
-    return failedImport("Src pattern has multiple results");
-
-  if (Src->isLeaf()) {
-    Init *SrcInit = Src->getLeafValue();
-    if (isa<IntInit>(SrcInit)) {
-      InsnMatcher.addPredicate<InstructionOpcodeMatcher>(
-          &Target.getInstruction(RK.getDef("G_CONSTANT")));
-    } else
-      return failedImport(
-          "Unable to deduce gMIR opcode to handle Src (which is a leaf)");
-  } else {
-    SrcGIEquivOrNull = findNodeEquiv(Src->getOperator());
-    if (!SrcGIEquivOrNull)
-      return failedImport("Pattern operator lacks an equivalent Instruction" +
-                          explainOperator(Src->getOperator()));
-    SrcGIOrNull = &Target.getInstruction(SrcGIEquivOrNull->getValueAsDef("I"));
-
-    // The operators look good: match the opcode
-    InsnMatcher.addPredicate<InstructionOpcodeMatcher>(SrcGIOrNull);
-  }
-
-  unsigned OpIdx = 0;
-  for (const TypeSetByHwMode &VTy : Src->getExtTypes()) {
-    // Results don't have a name unless they are the root node. The caller will
-    // set the name if appropriate.
-    OperandMatcher &OM = InsnMatcher.addOperand(OpIdx++, "", TempOpIdx);
-    if (auto Error = OM.addTypeCheckPredicate(VTy, false /* OperandIsAPointer */))
-      return failedImport(toString(std::move(Error)) +
-                          " for result of Src pattern operator");
-  }
-
-  if (Src->isLeaf()) {
     Init *SrcInit = Src->getLeafValue();
     if (IntInit *SrcIntInit = dyn_cast<IntInit>(SrcInit)) {
@@ -2638,8 +2631,6 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
       // here since we don't support ImmLeaf predicates yet. However, we still
       // need to note the hidden operand to get GIM_CheckNumOperands correct.
       InsnMatcher.addOperand(OpIdx++, "", TempOpIdx);
-      if (auto Error = importInstructionPredicates(InsnMatcher, Src))
-        return std::move(Error);
       return InsnMatcher;
     }
@@ -2674,8 +2665,6 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
     }
   }
 
-  if (auto Error = importInstructionPredicates(InsnMatcher, Src))
-    return std::move(Error);
   return InsnMatcher;
 }
 
@@ -3709,40 +3698,6 @@ TreePatternNode *GlobalISelEmitter::fixupPatternNode(TreePatternNode *N) {
         return Ext;
       }
     }
-
-    if (N->getOperator()->getName() == "atomic_load") {
-      // If it's a atomic-load we need to adapt the pattern slightly. We need
-      // to split the node into (anyext (atomic_load ...)), and then apply the
-      // <> predicate by updating the result type of the load.
-      //
-      // For example:
-      //   (atomic_load:[i32] [iPTR])<>
-      // must be transformed into:
-      //   (anyext:[i32] (atomic_load:[i16] [iPTR]))
-
-      std::vector<TreePredicateFn> Predicates;
-      Record *MemVT = nullptr;
-      for (const auto &P : N->getPredicateFns()) {
-        if (P.isAtomic() && P.getMemoryVT()) {
-          MemVT = P.getMemoryVT();
-          continue;
-        }
-        Predicates.push_back(P);
-      }
-
-      if (MemVT) {
-        TypeSetByHwMode ValueTy = getValueType(MemVT);
-        if (ValueTy != N->getType(0)) {
-          TreePatternNode *Ext =
-              new TreePatternNode(RK.getDef("anyext"), {N}, 1);
-          Ext->setType(0, N->getType(0));
-          N->clearPredicateFns();
-          N->setPredicateFns(Predicates);
-          N->setType(0, ValueTy);
-          return Ext;
-        }
-      }
-    }
   }
 
   return N;
diff --git a/utils/lit/lit/llvm/config.py b/utils/lit/lit/llvm/config.py
index 3c9a2cc559c1e..554da93f110be 100644
--- a/utils/lit/lit/llvm/config.py
+++ b/utils/lit/lit/llvm/config.py
@@ -228,7 +228,8 @@ def get_clang_has_lsan(self, clang, triple):
             minor_version_number = int(version_regex.group(2))
             patch_version_number = int(version_regex.group(3))
             if 'Apple LLVM' in version_string:
-                return major_version_number >= 9 and (minor_version_number > 0 or patch_version_number > 0)
+                # Apple LLVM doesn't yet support LSan
+                return False
             else:
                 return major_version_number >= 5
diff --git a/utils/not/CMakeLists.txt b/utils/not/CMakeLists.txt
index 4a92348ba0f9d..29c7b0218521d 100644
--- a/utils/not/CMakeLists.txt
+++ b/utils/not/CMakeLists.txt
@@ -2,4 +2,4 @@ add_llvm_utility(not
   not.cpp
   )
 
-target_link_libraries(not LLVMSupport)
+target_link_libraries(not PRIVATE LLVMSupport)
diff --git a/utils/yaml-bench/CMakeLists.txt b/utils/yaml-bench/CMakeLists.txt
index 403182ceee2a8..cd04b33d0a296 100644
--- a/utils/yaml-bench/CMakeLists.txt
+++ b/utils/yaml-bench/CMakeLists.txt
@@ -2,4 +2,4 @@ add_llvm_utility(yaml-bench
   YAMLBench.cpp
   )
 
-target_link_libraries(yaml-bench LLVMSupport)
+target_link_libraries(yaml-bench PRIVATE LLVMSupport)