diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 84112b8b18a81..ed3d9bbc77256 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -10,12 +10,11 @@ #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" - include "mlir/IR/EnumAttr.td" class XeGPUAttr traits = [], string baseCppClass = "::mlir::Attribute"> - : AttrDef { + : AttrDef { let mnemonic = attrMnemonic; } @@ -49,7 +48,7 @@ def XeGPU_SgMapAttr: XeGPUAttr<"SubGroupMap", "sg_map"> { def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let parameters = (ins - DefaultValuedParameter<"xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, + DefaultValuedParameter<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">: $memory_scope, DefaultValuedParameter<"int", "1">: $array_length, DefaultValuedParameter<"bool", "true">: $boundary_check, OptionalParameter<"xegpu::ScatteredAttr">: $scattered, @@ -58,7 +57,7 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let builders = [ AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">:$memory_scope, + CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">:$memory_scope, CArg<"int", "1">:$array_length, CArg<"xegpu::ScatteredAttr", "{}">:$scattered, CArg<"xegpu::SubGroupMapAttr", "{}">:$map @@ -72,65 +71,80 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let hasCustomAssemblyFormat = true; } -def XeGPU_ArgTypeAttr : I32EnumAttr< - "ArgType", "", [ I32EnumAttrCase<"Vector", 0, "vector">, - I32EnumAttrCase<"Scalar", 1, "scalar"> ]> { - let cppNamespace = "::mlir::xegpu"; +def ARG_TYPE_VECTOR : I32EnumAttrCase<"VECTOR", 0, "vector">; +def ARG_TYPE_SCALAR : I32EnumAttrCase<"SCALAR", 1, "scalar">; +def XeGPU_ArgTypeKind : I32EnumAttr<"ArgTypeKind", + "Argument type for Invoke_SIMD op", + [ARG_TYPE_VECTOR, ARG_TYPE_SCALAR]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_ModeAttr : I32EnumAttr< - "Mode", "", [ I32EnumAttrCase<"SIMT", 0, "simt">, - I32EnumAttrCase<"VC", 1, "vc"> ]> { - let cppNamespace = "::mlir::xegpu"; +def MODE_SIMT : I32EnumAttrCase<"SIMT", 0, "simt">; +def MODE_VC : I32EnumAttrCase<"VC", 1, "vc">; +def XeGPU_ModeKind : I32EnumAttr<"ModeKind", + "The Mode an operator runs on", + [MODE_SIMT, MODE_VC]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_MemoryScopeAttr : I32EnumAttr< - "MemoryScope", "", [ I32EnumAttrCase<"GLOBAL", 0, "global">, - I32EnumAttrCase<"SLM", 1, "slm"> ]> { - let cppNamespace = "::mlir::xegpu"; +def MEMORY_SCOPE_GLOBAL: I32EnumAttrCase<"GLOBAL", 0, "global">; +def MEMORY_SCOPE_SHARED: I32EnumAttrCase<"SLM", 1, "slm">; +def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind", + "The scope of the memory the tensor descritor is created for", + [MEMORY_SCOPE_GLOBAL, MEMORY_SCOPE_SHARED]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_CacheReadAttr : I32EnumAttr< - "CacheReadHint", "", [ I32EnumAttrCase<"UNCACHED", 0, "uncached">, - I32EnumAttrCase<"CACHED", 1, "cached">, - I32EnumAttrCase<"STREAMING", 2, "streaming">, - I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate"> ]> { +def CACHE_KIND_CACHED: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write +def CACHE_KIND_UNCACHED: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write +def 
CACHE_KIND_STREAMING: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only +def CACHE_KIND_INVALIDATE: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only +def CACHE_KIND_WRITE_BACK: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only +def CACHE_KIND_WRITE_THROUGH: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only - let cppNamespace = "::mlir::xegpu"; -} -def XeGPU_CacheWriteAttr : I32EnumAttr< - "CacheWriteHint", "", [ I32EnumAttrCase<"UNCACHED", 0, "uncached">, - I32EnumAttrCase<"WRITE_THROUGH", 1, "write_through">, - I32EnumAttrCase<"WRITE_BACK", 2, "write_back">, - I32EnumAttrCase<"STREAMING", 3, "streaming"> ]> { - let cppNamespace = "::mlir::xegpu"; +def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind", + [CACHE_KIND_CACHED, CACHE_KIND_UNCACHED, + CACHE_KIND_STREAMING, CACHE_KIND_INVALIDATE, + CACHE_KIND_WRITE_BACK, CACHE_KIND_WRITE_THROUGH]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } +def XeGPU_ArgTypeAttr : EnumAttr; +def XeGPU_ModeAttr : EnumAttr; +def XeGPU_MemoryScopeAttr : EnumAttr; +def XeGPU_CacheAttr : EnumAttr; + // RMW kind attribute -def ATOMIC_RMW_KIND_ADDF : I64EnumAttrCase<"addf", 0>; -def ATOMIC_RMW_KIND_ADDI : I64EnumAttrCase<"addi", 1>; -def ATOMIC_RMW_KIND_ASSIGN : I64EnumAttrCase<"assign", 2>; -def ATOMIC_RMW_KIND_MAXF : I64EnumAttrCase<"maxf", 3>; -def ATOMIC_RMW_KIND_MAXS : I64EnumAttrCase<"maxs", 4>; -def ATOMIC_RMW_KIND_MAXU : I64EnumAttrCase<"maxu", 5>; -def ATOMIC_RMW_KIND_MINF : I64EnumAttrCase<"minf", 6>; -def ATOMIC_RMW_KIND_MINS : I64EnumAttrCase<"mins", 7>; -def ATOMIC_RMW_KIND_MINU : I64EnumAttrCase<"minu", 8>; -def ATOMIC_RMW_KIND_MULF : I64EnumAttrCase<"mulf", 9>; -def ATOMIC_RMW_KIND_MULI : I64EnumAttrCase<"muli", 10>; -def ATOMIC_RMW_KIND_ORI : I64EnumAttrCase<"ori", 11>; -def ATOMIC_RMW_KIND_ANDI : I64EnumAttrCase<"andi", 12>; - -def XeGPU_AtomicRMWKindAttr : I64EnumAttr< - "AtomicRMWKind", "", - [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN, - ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU, - ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU, - ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI, - ATOMIC_RMW_KIND_ANDI]> { +def ATOMIC_RMW_KIND_ADDF : I32EnumAttrCase<"addf", 0>; +def ATOMIC_RMW_KIND_ADDI : I32EnumAttrCase<"addi", 1>; +def ATOMIC_RMW_KIND_ASSIGN : I32EnumAttrCase<"assign", 2>; +def ATOMIC_RMW_KIND_MAXF : I32EnumAttrCase<"maxf", 3>; +def ATOMIC_RMW_KIND_MAXS : I32EnumAttrCase<"maxs", 4>; +def ATOMIC_RMW_KIND_MAXU : I32EnumAttrCase<"maxu", 5>; +def ATOMIC_RMW_KIND_MINF : I32EnumAttrCase<"minf", 6>; +def ATOMIC_RMW_KIND_MINS : I32EnumAttrCase<"mins", 7>; +def ATOMIC_RMW_KIND_MINU : I32EnumAttrCase<"minu", 8>; +def ATOMIC_RMW_KIND_MULF : I32EnumAttrCase<"mulf", 9>; +def ATOMIC_RMW_KIND_MULI : I32EnumAttrCase<"muli", 10>; +def ATOMIC_RMW_KIND_ORI : I32EnumAttrCase<"ori", 11>; +def ATOMIC_RMW_KIND_ANDI : I32EnumAttrCase<"andi", 12>; + +def XeGPU_AtomicRMWKind : I32EnumAttr<"AtomicRMWKind", + "Operation type for AtomicRMW", + [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN, + ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU, + ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU, + ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI, + ATOMIC_RMW_KIND_ANDI]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } +def XeGPU_AtomicRMWKindAttr : EnumAttr; #endif // 
MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td index ae29f87a8812a..f85ccb32cc43b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td @@ -22,31 +22,23 @@ include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/ShapedOpInterfaces.td" - -// Provide a definition of the 'XeGPU' dialect in the ODS framework so that we -// can define our operations. -def XeGPUDialect : Dialect { - // The namespace of our dialect +def XeGPU_Dialect : Dialect { let name = "xegpu"; - - // A short one-line summary of our dialect. + let cppNamespace = "::mlir::xegpu"; let summary = "The XeGPU dialect that models Intel GPU's ISA"; - - // A longer description of our dialect. let description = [{ - The XeGPU dialect models Intel Xe ISA semantics but works at vector and - TensorDesc data type. It provides 1:1 mappings to match Xe instructions like - DPAS and 2D block load. The matrix size being processed at this level - exactly matches the hardware instructions or the intrinsic supported by - the lower-level GPU compiler. - }]; - - // The C++ namespace that the dialect class definition resides in. - let cppNamespace = "::mlir::xegpu"; - - let dependentDialects = ["::mlir::memref::MemRefDialect"]; + The XeGPU dialect models Intel Xe ISA semantics but works at vector and + TensorDesc data type. It provides 1:1 mappings to match Xe instructions + like DPAS and 2D block load. The matrix size being processed at this level + exactly matches the hardware instructions or the intrinsic supported by + the lower-level GPU compiler. + }]; + + let dependentDialects = [ + "arith::ArithDialect", + "memref::MemRefDialect" + ]; - // TODO: temporary disable it. let useDefaultTypePrinterParser = true; let useDefaultAttributePrinterParser = true; } diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 6866f903d715e..766590f6a3f87 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -19,35 +19,36 @@ include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td" // * The parent dialect of the operation. // * The mnemonic for the operation, or the name without the dialect prefix. // * A list of traits for the operation. -class XeGPU_Op traits = []> : - Op; +class XeGPU_Op traits = []>: + Op; -def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { let summary = "create nd tensor descriptor operation"; let description = [{ The "create_nd_tdesc" operation creates a TensorDescType which represents a sub-view of a 2D memory region (It can be extended to support N-D memory - region if needed in future). Elements in the subview continuous in each dimention. - It encodes the following important information for supporting intel hardware features: - - * source: an object representing (starting address/pointer of) a 2D memory reagion. It can - be either a 2D memref object, or simply a pointer represented by uint64_t type. - * offsets: two index values represents offsets from the "source" at the each dimension at - which the subview of the target memory will be created. 
It is encoded via two - variables, including "dynamic_offsets" and "static_offsets", such that it can - accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])). - * shape: the shape information of the memory region pointed by the "source". It is typically - encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. But if "source" - is simply a pointer represented as uint64_t type, or a memref type without shape information - e.g., memref, the shape information has to be explicitly passed via the "dynamic_shape" - argument. Currently "dynamic_shape" only accepts operands(e.g., [%c4096, %c4096]), - not attributes(e.g., [4096, 4096]). - * strides: the strides of the memory region pointed by the "source". Similar to shape, it is typically - encoded via the MemRefType of the source too. But if "source" is simply a pointer represented - as uint64_t type, or a memref type without shape information e.g., memref, the strides - information has to be explicitly passed via the "dynamic_strides" argument. And it currently - only accepts operands two. + region if needed in future). Elements in the subview are continuous in each + dimension. It encodes the following important information for supporting + Intel hardware features: + + * source: an object representing (starting address/pointer of) a 2D memory region. + It can be either a 2D memref object, or simply a pointer represented by uint64_t type. + * offsets: two index values represent offsets from the "source" at each dimension + at which the subview of the target memory will be created. It is encoded via two + variables, including "dynamic_offsets" and "static_offsets", such that it can + accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])). + * shape: the shape information of the memory region pointed by the "source". It is + typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. + But if "source" is simply a pointer represented as uint64_t type, or a memref + type without shape information e.g., memref, the shape information has + to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" + only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]). + * strides: the strides of the memory region pointed by the "source". Similar to shape, + it is typically encoded via the MemRefType of the source too. But if "source" is + simply a pointer represented as uint64_t type, or a memref type without shape + information e.g., memref, the strides information has to be explicitly + passed via the "dynamic_strides" argument. And it currently only accepts operands too. 
Example 1 (suppose the tensor shape inferred by the compiler is 8x16): %0 = memref.alloc() : memref<32x24xf32> @@ -68,30 +69,32 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32> }]; - let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $dynamic_offsets, - Variadic: $dynamic_shape, - Variadic: $dynamic_strides, - DenseI64ArrayAttr: $static_offsets, - DefaultValuedAttr: $mode); - + let arguments = (ins XeGPU_BaseAddrType: $source, + Variadic: $dynamic_offsets, + Variadic: $dynamic_shape, + Variadic: $dynamic_strides, + DenseI64ArrayAttr: $static_offsets, + DefaultValuedAttr: $mode); let results = (outs XeGPU_TensorDesc:$TensorDesc); let hasCustomAssemblyFormat = 1; - let skipDefaultBuilders = 1; + let hasVerifier = 1; let builders = [ - OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, - "ValueRange": $shape, "ValueRange": $strides, "::llvm::ArrayRef": $static_offsets, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)>, + OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, + "ValueRange": $shape, "ValueRange": $strides, + "llvm::ArrayRef": $static_offsets, + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, "::llvm::ArrayRef": $offsets, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)>, + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets, + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, "::llvm::ArrayRef": $offsets, + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets, "ValueRange": $shape, "ValueRange": $stride, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)> + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)> ]; let extraClassDeclaration = [{ @@ -124,7 +127,6 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe /// strides information from memref type will be ignored. llvm::SmallVector getStrides(); - /// return the shape embeded in the memref type of the source. /// If source is not memref type. array of kDynamic will be returned. llvm::ArrayRef getStaticShape(); @@ -133,92 +135,37 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe /// If source is not memref type. array of kDynamic will be returned. llvm::ArrayRef getStaticStrides(); - /// Return the element type of the TensorDesc Type getElementType(); /// Return the shape of the TensorDesc llvm::ArrayRef getTensorDescShape(); - - - }]; - let hasVerifier = 1; -} - -def XeGPU_CreateDescOp - : XeGPU_Op<"create_tdesc", [Pure]> { - - let summary = "create scattered tensor descritors (TensorDesc)."; - let description = [{ - "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates a TensorDesc for a memory region. - while "create_nd_tdesc" is for creating continious subviews, "create_tdesc" is for creating non-continious - (scattered) subviews. It only works with VectorCompute (VC) mode and accepts the following parameters: - - * source: a 1D memref or pointer (uint64_t) represents the memory object. - * offsets: It is a 1D vector containing offsets of each access point, the size should be aligned with - supportted group size, e.g., vector<16xindex>. And each element in the vector corresponds to a - work item (SIMT lane) in the subgroup. 
- * chunk_size_per_lane: [optional attribute] indicates number of continious elements accessed for each offset, default is 1. - - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - %a = memref.alloc() : memref<1024xf32> - %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> - %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32> - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. - It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - %0 = memref.alloc() : memref<1024xf32> - %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> - %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> - }]; - - let arguments = (ins XeGPU_BaseAddrType: $source, - XeGPU_OffsetType: $offsets, - DefaultValuedAttr: $chunk_size_per_lane, - DefaultValuedAttr: $mode); - - let results = (outs XeGPU_TensorDesc:$TensorDesc); - - let builders = [ - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>, - - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)> - ]; - let skipDefaultBuilders = 1; - - // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} - // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; } - def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { - let summary = "loads a n-D block from global memory (represented by TensorDesc) to registers (represented by vector)"; + let summary = "loads a n-D block from memory (represented by TensorDesc) " + "to registers (represented by vector)"; let description = [{ - LoadNDOp essentially mimics the hardware block read instruction to read a block of data from memory to register. - It takes a set of cache hints for each level of cache, L1, L2 and L3. If hardware does not have a correspoding cache, - Corresponding cache hint attribute will be masked. - - If both transpose and vnni_axis present at the same time. it assume to perform transpose first and then vnni transform. + LoadNDOp essentially mimics the hardware block read instruction to read + a block of data from memory to register. It takes a set of cache hints + for each level of cache, L1, L2 and L3. If hardware does not have a + corresponding cache, the corresponding cache hint attribute will be masked. + If both transpose and vnni_axis are present at the same time, transpose is + assumed to be performed first, followed by the vnni transform. 
}]; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $vnni_axis, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode); + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $vnni_axis, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + OptionalAttr: $transpose, + DefaultValuedAttr: $mode); let results = (outs XeGPU_ValueType: $value); let extraClassDeclaration = [{ - VectorType getValueType() { return llvm::dyn_cast(getValue().getType()); } @@ -226,26 +173,22 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { xegpu::TensorDescType getTensorDescType() { return getTensorDesc().getType(); } - }]; // Format: xegpu.load_nd %1 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming} // : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; } def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_ValueType: $value, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + let arguments = (ins XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); // Format: xegpu.store_nd %3, %2 {l1_hint = write_back, l2_hint = uncached} // : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> @@ -256,13 +199,12 @@ def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { def XeGPU_PrefetchNDOp : XeGPU_Op<"prefetch_nd", []> { let summary = "prefetches a nD block to cache"; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); - // In format of: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}: + // Format: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}: // !xegpu.tensor_desc<8x16xf16> let hasCustomAssemblyFormat = 1; } @@ -273,85 +215,93 @@ def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> { let arguments = (ins XeGPU_TensorDesc: $TensorDesc, Variadic: $offsets, - DefaultValuedAttr: $mode); + DefaultValuedAttr: $mode); let results = (outs XeGPU_TensorDesc: $result); - let assemblyFormat = [{ - $TensorDesc `,` (`[` $offsets^ `]`)? (`{` `mode` `=` $mode^ `}`)? - attr-dict `:` qualified(type($TensorDesc)) `->` qualified(type($result)) - }]; - + let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } +def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure]> { + let summary = "create scattered tensor descritors (TensorDesc)."; + let description = [{ + "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates + a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc" + is for creating continious subviews, "create_tdesc" is for creating non-continious + (scattered) subviews. 
It is designed to work only with VectorCompute (VC) mode and + accepts the following parameters: -def XeGPU_DpasOp : XeGPU_Op<"dpas"> { - let summary = "performs dpas computation"; - let arguments = (ins - XeGPU_DpasOpType : $lhs, - XeGPU_DpasOpType : $rhs, - Optional: $acc, - DefaultValuedAttr: $mode - ); - let results = (outs XeGPU_Vector2DType: $result); - let assemblyFormat = [{ - $lhs `,` $rhs (`,` $acc^)? (` ``{` `mode` `=` $mode^ `}`)? attr-dict `:` - qualified(type($lhs)) `,` qualified(type($rhs)) (`,` qualified(type($acc))^)? `->` qualified(type($result)) - }]; + * source: a 1D memref or pointer (uint64_t) representing the memory object. + * offsets: It is a 1D vector containing offsets of each access point, the size should be + aligned with the supported group size, e.g., vector<16xindex>. And each element in the vector corresponds + to a work item (SIMT lane) in the subgroup. + * chunk_size_per_lane: [optional attribute] indicates number of continuous elements + accessed for each offset, default is 1. - let extraClassDeclaration = [{ - VectorType getLhsType() { - return ::llvm::cast(getLhs().getType()); - } + Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] + %a = memref.alloc() : memref<1024xf32> + %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> + %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32> - VectorType getRhsType() { - return ::llvm::cast(getRhs().getType()); - } + Example 2. It assumes subgroup size is 4, and each workitem accesses 8 elements. + It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] + %0 = memref.alloc() : memref<1024xf32> + %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> + %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + }]; - VectorType getAccType() { - return ::llvm::cast(getAcc().getType()); - } + let arguments = (ins XeGPU_BaseAddrType: $source, + XeGPU_OffsetType: $offsets, + DefaultValuedAttr: $chunk_size_per_lane, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_TensorDesc:$TensorDesc); - VectorType getResultType() { return getResult().getType(); } - }]; + let builders = [ + OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, + "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>, + OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, + "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)> + ]; + let skipDefaultBuilders = 1; + // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} + // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> + let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } def XeGPU_LoadGatherOp : XeGPU_Op<"load"> { let summary = "load a scalar at source[offset]."; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $vnni_axis, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); - + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $vnni_axis, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); let results = (outs XeGPU_ValueType: $value); let builders = [ - OpBuilder<(ins "Type": $value, "Value": $TensorDesc, "Value": $mask, "IntegerAttr": $vnni_axis, - CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, - CArg<"xegpu::CacheReadHintAttr", 
"xegpu::CacheReadHintAttr()">: $l1_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l2_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l3_hint)>, - - OpBuilder<(ins "Type": $value, "Value": $TensorDesc, "Value": $mask, "IntegerAttr": $vnni_axis, - CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l1_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l2_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l3_hint)> - + OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc, + "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis, + CArg<"mlir::DenseI64ArrayAttr", "mlir::DenseI64ArrayAttr()">: $transpose, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, + + OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc, + "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis, + CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)> ]; let skipDefaultBuilders = 1; - // In format of: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached} + // Format: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached} // : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -364,22 +314,21 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> { XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode ); let builders = [ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l1_hint, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l2_hint, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l3_hint)>, - + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l1_hint, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l2_hint, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l3_hint)> + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l3_hint)> ]; let skipDefaultBuilders = 1; @@ -389,71 +338,86 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> { let hasVerifier = 1; } -def XeGPU_UpdateOffsetOp - : XeGPU_Op<"update_offset", []> { - let summary = "update the offsets for the given tensor descriptor"; - - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_OffsetType: $offsets, - DefaultValuedAttr: $mode - ); - - 
let results = (outs XeGPU_TensorDesc: $result); - - let builders = [ - OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets), [{ - $_state.addOperands(TensorDesc); - $_state.addOperands(offsets); - $_state.getOrAddProperties().mode = xegpu::ModeAttr::get($_builder.getContext(), xegpu::Mode::VC); - $_state.addTypes(result); - }]> - ]; - - let skipDefaultBuilders = 1; - - let assemblyFormat = [{ - $TensorDesc `,` $offsets (`{` `mode` `=` $mode^ `}`)? - attr-dict `:` qualified(type($TensorDesc)) `,` qualified(type($offsets)) `->` qualified(type($result)) - }]; - - let hasVerifier = 1; - } - def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { let summary = "prefetches a nD block to cache"; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); let builders = [ OpBuilder<(ins "Value": $TensorDesc, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l1_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l2_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l3_hint)>, - + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, OpBuilder<(ins "Value": $TensorDesc, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l1_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l2_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l3_hint)> + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)> ]; let skipDefaultBuilders = 1; + let hasVerifier = 1; - // In format of: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}: + // Format: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}: // !xegpu.tensor_desc<8x16xf16> let hasCustomAssemblyFormat = 1; +} + +def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", []> { + let summary = "update the offsets for the given tensor descriptor"; + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_OffsetType: $offsets, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_TensorDesc: $result); + + let builders = [ + OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets)> + ]; + + let skipDefaultBuilders = 1; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + +def XeGPU_DpasOp : XeGPU_Op<"dpas"> { + let summary = "performs dpas computation"; + let arguments = (ins + XeGPU_DpasOpType : $lhs, + XeGPU_DpasOpType : $rhs, + Optional: $acc, + DefaultValuedAttr: $mode + ); + let results = (outs XeGPU_Vector2DType: $result); + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + VectorType getLhsType() { + return ::llvm::cast(getLhs().getType()); + } + + VectorType getRhsType() { + return ::llvm::cast(getRhs().getType()); + } + + VectorType getAccType() { + return ::llvm::cast(getAcc().getType()); + } + + VectorType getResultType() { + return getResult().getType(); + } + }]; + let hasVerifier = 1; } def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { let summary = "Invoke_SIMD operation"; let description = [{ - The `xegpu.invoke_SIMD` operation works similar to a direct call to a 
function. But it is - special to Intel GPU. + The `xegpu.invoke_SIMD` operation works similar to a direct call to a function. + But it is special to Intel GPU. }]; let arguments = (ins FlatSymbolRefAttr:$callee, @@ -463,15 +427,12 @@ def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { let builders = [ OpBuilder<(ins "SymbolRefAttr":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, - + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, OpBuilder<(ins "StringAttr":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, - + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, OpBuilder<(ins "llvm::StringRef":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)> + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)> ]; - } def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { @@ -481,107 +442,64 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { XeGPU_TensorDesc:$tensorDesc, XeGPU_MaskType:$mask, Optional:$value, - DefaultValuedAttr: $mode + DefaultValuedAttr: $mode ); + let results = (outs XeGPU_ValueType:$result); - let assemblyFormat = [{ - $kind $tensorDesc `,` $mask (`,` $value^)? (`{` `mode` `=` $mode^ `}`)? attr-dict `:` qualified(type(operands)) `->` type($result) - }]; + let hasCustomAssemblyFormat = 1; let builders = [ OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKindAttr": $kind, - "Value": $tensorDesc, "Value": $mask, - "Value": $value)>, - + "Value": $tensorDesc, "Value": $mask, "Value": $value)>, OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKind": $kind, - "Value": $tensorDesc, "Value": $mask, - "Value": $value)> + "Value": $tensorDesc, "Value": $mask, "Value": $value)> ]; let skipDefaultBuilders = 1; - let hasVerifier = 1; } - def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> { - let summary = "allocate a specific number of named barriers."; - let arguments = (ins I32Attr: $nbarrierCount); - let assemblyFormat = "$nbarrierCount attr-dict"; + let summary = "allocate a specific number of named barriers."; + let arguments = (ins I64Attr: $nbarrierCount); + let assemblyFormat = "$nbarrierCount attr-dict"; +} + + +def XeGPU_CreateNbarrierOp: XeGPU_Op<"create_nbarrier", []> { + let summary = "create a named barrier."; + let arguments = (ins I8: $nbarrier_id, + I8: $nbarrier_role, + I8Attr: $num_producers, + I8Attr: $num_consumers, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_Nbarrier: $result); + let hasCustomAssemblyFormat = 1; } +def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> { + let summary = "arrive at a named barrier."; + let arguments = (ins XeGPU_Nbarrier: $payload); + let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload))}]; +} -def XeGPU_CreateNbarrierOp - : XeGPU_Op<"create_nbarrier", []> { - let summary = "create a named barrier."; - - let arguments = (ins - I8: $nbarrier_id, - I8: $nbarrier_role, - I8Attr: $num_producers, - I8Attr: $num_consumers, - DefaultValuedAttr: $mode - ); - - let results = (outs XeGPU_Nbarrier: $result); - - let assemblyFormat = [{ - $nbarrier_id `,` $nbarrier_role - attr-dict `:` `(` qualified(type($nbarrier_id)) `,` qualified(type($nbarrier_role)) `)` - `->` qualified(type($result)) - }]; - - // let hasVerifier = 1; - } - -def XeGPU_NbarrierArriveOp - : XeGPU_Op<"nbarrier_arrive", []> { - let summary = "arrive at a named barrier."; - - let arguments = (ins - XeGPU_Nbarrier: $payload 
- ); - - let assemblyFormat = [{ - $payload attr-dict `:` qualified(type($payload)) - }]; - } - -def XeGPU_NbarrierWaitOp - : XeGPU_Op<"nbarrier_wait", []> { - let summary = "wait for a named barrier."; - - let arguments = (ins - XeGPU_Nbarrier: $payload - ); - - let assemblyFormat = [{ - $payload attr-dict `:` qualified(type($payload)) - }]; - } - -def XeGPU_CompileHintOp - : XeGPU_Op<"compile_hint", []> { - let summary = "prevents the compiler from scheduling."; - - let assemblyFormat = [{ - attr-dict - }]; - } - -def XeGPU_MfenceOp - : XeGPU_Op<"mfence", []> { - let summary = "lsc fence."; - - let arguments = (ins - StrAttr: $memory_kind, - StrAttr: $fence_op, - StrAttr: $fence_scope - ); - - let assemblyFormat = [{ - attr-dict - }]; - } +def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> { + let summary = "wait for a named barrier."; + let arguments = (ins XeGPU_Nbarrier: $payload); + let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload)) }]; +} + +def XeGPU_CompileHintOp: XeGPU_Op<"compile_hint", []> { + let summary = "prevents the compiler from scheduling."; + let assemblyFormat = [{ attr-dict }]; +} + +def XeGPU_MfenceOp: XeGPU_Op<"mfence", []> { + let summary = "lsc fence."; + let arguments = (ins StrAttr: $memory_kind, + StrAttr: $fence_op, + StrAttr: $fence_scope); + let assemblyFormat = [{ attr-dict }]; +} #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 8d2f1e769c304..b3dceff9587ad 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -15,7 +15,6 @@ include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td" include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" // An Integer array attribute with fixed 2 elements. 
-def XeGPU_IntArrayAttr2: ConfinedAttr]>; def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>; def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; @@ -31,7 +30,7 @@ def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef traits = [], string baseCppClass = "::mlir::Type"> - : TypeDef { + : TypeDef { let mnemonic = typeMnemonic; } @@ -108,13 +107,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", )>, TypeBuilder<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - "mlir::xegpu::MemoryScope": $memory_scope, "int": $array_length, + "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length, "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered, "mlir::xegpu::SubGroupMapAttr": $mapping )>, TypeBuilderWithInferredContext<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - "mlir::xegpu::MemoryScope": $memory_scope, "int": $array_length, + "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length, "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered, "mlir::xegpu::SubGroupMapAttr": $mapping )> @@ -147,7 +146,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return llvm::dyn_cast_if_present(getEncoding()); } - xegpu::MemoryScope getMemoryScope(); + xegpu::MemoryScopeKind getMemoryScope(); int getArrayLength(); bool getBoundaryCheck(); xegpu::ScatteredAttr getScattered(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 4a2eec6fde163..60ab50227c224 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -76,8 +76,7 @@ mlir::LogicalResult SubGroupMapAttr::verify( mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, mlir::Type type) { - - mlir::FailureOr memory_scope; + mlir::FailureOr memory_scope; mlir::FailureOr array_length; mlir::FailureOr boundary_check; mlir::FailureOr scattered; @@ -105,7 +104,7 @@ mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, seen_memory_scope = true; // Parse variable 'memory_scope' memory_scope = - mlir::FieldParser::parse(parser); + mlir::FieldParser::parse(parser); if (mlir::failed(memory_scope)) return parser.emitError( parser.getCurrentLocation(), @@ -157,7 +156,8 @@ mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, if (parser.parseGreater()) return {}; return TensorDescAttr::get( - parser.getContext(), memory_scope.value_or(xegpu::MemoryScope::GLOBAL), + parser.getContext(), + memory_scope.value_or(xegpu::MemoryScopeKind::GLOBAL), array_length.value_or(1), boundary_check.value_or(true), scattered.value_or(xegpu::ScatteredAttr()), map.value_or(xegpu::SubGroupMapAttr())); @@ -169,7 +169,7 @@ void TensorDescAttr::print(::mlir::AsmPrinter &printer) const { printer << "<"; - if (printDefaults || getMemoryScope() != xegpu::MemoryScope::GLOBAL) { + if (printDefaults || getMemoryScope() != xegpu::MemoryScopeKind::GLOBAL) { if (printSep) printer << ", "; printSep = true; @@ -208,7 +208,7 @@ void TensorDescAttr::print(::mlir::AsmPrinter &printer) const { bool TensorDescAttr::hasNonDefaultAttrs() { int count = 0; - if (getMemoryScope() != MemoryScope::GLOBAL) + if (getMemoryScope() != MemoryScopeKind::GLOBAL) count++; if (getBoundaryCheck() != true) count++; @@ -222,7 +222,7 @@ bool TensorDescAttr::hasNonDefaultAttrs() { } 
TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, - xegpu::MemoryScope memory_scope, + xegpu::MemoryScopeKind memory_scope, int array_length, xegpu::ScatteredAttr scattered, xegpu::SubGroupMapAttr map) { @@ -286,12 +286,12 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { if (printDefaultValues()) { auto encoding = getEncoding(); if (auto attr = getEncodingAsMapAttr()) { - encoding = - TensorDescAttr::get(getContext(), MemoryScope::GLOBAL, 1, {}, attr); + encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1, + {}, attr); } if (auto attr = getEncodingAsScatteredAttr()) { - encoding = - TensorDescAttr::get(getContext(), MemoryScope::GLOBAL, 1, attr, {}); + encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1, + attr, {}); } printer << ", " << encoding; } else if (auto encoding = getEncodingAsTensorDescAttr()) { @@ -312,7 +312,7 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, TensorDescType TensorDescType::get(mlir::MLIRContext *context, llvm::ArrayRef shape, mlir::Type elementType, - mlir::xegpu::MemoryScope memory_scope, + mlir::xegpu::MemoryScopeKind memory_scope, int array_length, bool boundary_check, mlir::xegpu::ScatteredAttr scattered, mlir::xegpu::SubGroupMapAttr mapping) { @@ -323,7 +323,7 @@ TensorDescType TensorDescType::get(mlir::MLIRContext *context, TensorDescType TensorDescType::get(llvm::ArrayRef shape, mlir::Type elementType, - mlir::xegpu::MemoryScope memory_scope, + mlir::xegpu::MemoryScopeKind memory_scope, int array_length, bool boundary_check, mlir::xegpu::ScatteredAttr scattered, mlir::xegpu::SubGroupMapAttr mapping) { @@ -333,12 +333,12 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, return Base::get(elementType.getContext(), shape, elementType, attr); } -xegpu::MemoryScope TensorDescType::getMemoryScope() { +xegpu::MemoryScopeKind TensorDescType::getMemoryScope() { auto attr = getEncodingAsTensorDescAttr(); if (attr) return attr.getMemoryScope(); // return default value - return MemoryScope::GLOBAL; + return MemoryScopeKind::GLOBAL; } int TensorDescType::getArrayLength() { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a831e3bc2ae5d..627680e84ec94 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -23,27 +23,12 @@ #define DEBUG_TYPE "xegpu" namespace mlir { +class Token; + namespace xegpu { extern bool printDefaultValues(); -static size_t getRankOf(Value value) { - if (value.getType().isIntOrIndexOrFloat()) - return 0; - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - llvm_unreachable("Unsupported value for getRankOf"); -} - -static void transpose(llvm::ArrayRef trans, - std::vector &shape) { - std::vector old = shape; - for (size_t i = 0; i < trans.size(); i++) - shape[i] = old[trans[i]]; -}; - template static std::string makeString(T array, bool breakline = false) { std::string buf; @@ -60,121 +45,21 @@ static std::string makeString(T array, bool breakline = false) { return buf; } -template -static ParseResult parseCustomEnumAttr(OpAsmParser &parser, - OperationState &result, - llvm::StringRef attrKeyword) { - auto loc = parser.getCurrentLocation(); - auto attrOptional = FieldParser::parse(parser); - if (failed(attrOptional)) - return parser.emitError(loc, "invalid attribute specification"); - auto attr = - CustomEnumAttr::get(parser.getBuilder().getContext(), 
*attrOptional); - result.addAttribute(attrKeyword, attr); - return success(); -} - -template -static ParseResult parseBoolAndIntegerAttr(OpAsmParser &parser, - OperationState &result, - llvm::StringRef attrKeyword) { - AttrType attr; - Type ty; - - if (std::is_same::value) { - ty = parser.getBuilder().getIntegerType(1); - } else if (std::is_same::value) { - ty = parser.getBuilder().getIntegerType(32); - } else if (std::is_same::value) { - ty = Type{}; - } else { - llvm_unreachable("Unsupported Attribute Type."); - } - - if (parser.parseCustomAttributeWithFallback(attr, ty)) - return failure(); - - if (attr) - result.addAttribute(attrKeyword, attr); - return success(); -}; - -/// @brief Parsing optional attribute list which are enclosed in braces "{}", -/// and seperated by comma -/// @param parser -/// @param result -/// @param allowedKeywords -/// @return -static ParseResult -parseOptionalAttrDict(OpAsmParser &parser, OperationState &result, - llvm::ArrayRef allowedKeywords, - bool isWrite = false) { - // no optional attributes, return success - if (failed(parser.parseOptionalLBrace())) - return success(); - - auto parseElt = [&]() -> ParseResult { - auto loc = parser.getCurrentLocation(); - llvm::StringRef nameId; - if (parser.parseOptionalKeyword(&nameId, allowedKeywords)) - return parser.emitError(loc, "invalid attribute keyword: ") - << nameId << ".\n"; - - if (parser.parseEqual()) - return failure(); - - if (nameId == "l1_hint" || nameId == "l2_hint" || nameId == "l3_hint") { - if (isWrite) - return parseCustomEnumAttr( - parser, result, nameId); - else - return parseCustomEnumAttr( - parser, result, nameId); - } - - if (nameId == "mode") { - return parseCustomEnumAttr(parser, result, nameId); - } - - if (nameId == "chunk_size_per_lane" || nameId == "vnni_axis") - return parseBoolAndIntegerAttr(parser, result, nameId); - - if (nameId == "boundary_check") - return parseBoolAndIntegerAttr(parser, result, nameId); - - if (nameId == "transpose") - return parseBoolAndIntegerAttr(parser, result, nameId); - - llvm_unreachable("Unsupported attribute keyword."); - }; - - if (parser.parseCommaSeparatedList(parseElt)) - return failure(); - - return parser.parseRBrace(); +static size_t getRankOf(Value value) { + if (value.getType().isIntOrIndexOrFloat()) + return 0; + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + llvm_unreachable("Unsupported value for getRankOf"); } -template -static void printCacheHintAttrs(OpAsmPrinter &printer, T op, bool printSep) { - if (op.getL1HintAttr()) { - if (printSep) - printer << ", "; - printer << "l1_hint = " << op.getL1Hint().value(); - printSep = true; - } - - if (op.getL2HintAttr()) { - if (printSep) - printer << ", "; - printer << "l2_hint = " << op.getL2Hint().value(); - printSep = true; - } - - if (op.getL3HintAttr()) { - if (printSep) - printer << ", "; - printer << "l3_hint = " << op.getL3Hint().value(); - } +static void transpose(llvm::ArrayRef trans, + std::vector &shape) { + std::vector old = shape; + for (size_t i = 0; i < trans.size(); i++) + shape[i] = old[trans[i]]; } static bool verifyAndInferShape(std::vector &shape, @@ -202,22 +87,94 @@ static bool verifyAndInferShape(std::vector &shape, return true; } -/// @brief the base builder for CreateNdDescOp -/// @param builder, the mlir OpBuilder -/// @param state , the mlir OperationState -/// @param TensorDesc, the TensorDescType of the result -/// @param source, the base address of 
the data. It can be either 2D memref -/// object or simple integer value (pointer) -/// @param offsets, the dynamic offset given as Value -/// @param shape, the dynamic shape given as array of Values -/// @param strides, the dynamic shape given as array of Values -/// @param static_offsets, the static offset. If it is not used it should be -/// filled with ShapeType::kDynamic -/// @param mode, VC or SIMT +static ParseResult +parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser, + OperationState &result) { + // no optional attributes, return success + if (failed(parser.parseOptionalLBrace())) + return success(); + + llvm::SmallDenseSet seenKeys; + auto parseElt = [&]() -> ParseResult { + // The name of an attribute can either be a keyword, or a string. + // as compared to mlir::parseOptionalAttrList, the cases of using + // TOken::bare_identifier and Token::inttype as key maybe not handlered + std::string nameId; + auto loc = parser.getCurrentLocation(); + if (parser.parseOptionalKeywordOrString(&nameId)) + return parser.emitError(loc, "invalid attribute name: ") + << nameId << ".\n"; + + if (nameId.empty()) + return parser.emitError(loc, "expected valid attribute name"); + + if (!seenKeys.insert(nameId).second) + return parser.emitError(loc, "duplicate key '") + << nameId << "' in dictionary attribute."; + + // Lazy load a dialect in the context if there is a possible namespace. + auto splitName = StringRef(nameId).split('.'); + if (!splitName.second.empty()) + parser.getContext()->getOrLoadDialect(splitName.first); + + // Try to parse the '=' for the attribute value. + if (parser.parseEqual()) { + // If there is no '=', it is treated as a unit attribute. + result.addAttribute(nameId, parser.getBuilder().getUnitAttr()); + return success(); + } + + // for xegpu specific attributes + if (nameId == "mode") { + ModeKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "l1_hint" || nameId == "l2_hint" || + nameId == "l3_hint") { + CacheKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "transpose") { + // in form of [4, 5], acctually it is a copy of DenseI63ArrayAttr::parse() + if (succeeded(parser.parseOptionalLSquare())) { + Attribute attr; + // handle empty list case + if (succeeded(parser.parseOptionalRSquare())) { + attr = DenseI64ArrayAttr::get(parser.getContext(), {}); + } else { + attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{}); + if (failed(parser.parseRSquare())) + return failure(); + } + if (!attr) + return failure(); + result.addAttribute(nameId, attr); + return success(); + } else { + // in form of array + DenseI64ArrayAttr attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + } else { + Attribute attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + }; + + if (parser.parseCommaSeparatedList(parseElt)) + return failure(); + + return parser.parseRBrace(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNdDescOp +//===----------------------------------------------------------------------===// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type TensorDesc, Value source, ValueRange offsets, ValueRange shape, ValueRange strides, - llvm::ArrayRef static_offsets, Mode mode) { + llvm::ArrayRef static_offsets, + ModeKind mode) { auto offsetRank = static_offsets.size(); auto shapeRank = shape.size() 
? shape.size() : getRankOf(source); @@ -243,13 +200,14 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, state.addAttribute(getStaticOffsetsAttrName(state.name), builder.getDenseI64ArrayAttr(static_offsets)); state.addAttribute(getModeAttrName(state.name), - xegpu::ModeAttr::get(builder.getContext(), mode)); + xegpu::ModeKindAttr::get(builder.getContext(), mode)); state.addTypes(TensorDesc); } void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, Value source, - llvm::ArrayRef offsets, Mode mode) { + llvm::ArrayRef offsets, + ModeKind mode) { auto ty = llvm::dyn_cast_if_present(source.getType()); assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); @@ -266,8 +224,7 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, Value source, llvm::ArrayRef offsets, - ValueRange shape, ValueRange stride, - xegpu::Mode mode) { + ValueRange shape, ValueRange stride, ModeKind mode) { assert(shape.size() && offsets.size() && stride.size() && shape.size() == stride.size() && shape.size() == offsets.size()); @@ -283,11 +240,9 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { // parse the source operand - OpAsmParser::UnresolvedOperand sourceRawOperands[1]; - llvm::ArrayRef sourceOperands( - sourceRawOperands); + llvm::SmallVector sourceOperands(1); llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(sourceRawOperands[0])) + if (parser.parseOperand(sourceOperands[0])) return failure(); // parse the offset operand, in format of [x, y] @@ -328,23 +283,28 @@ ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); } - if (parseOptionalAttrDict(parser, result, {"boundary_check", "mode"})) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - Type sourceRawTypes[1]; - llvm::ArrayRef sourceTypes(sourceRawTypes); - if (parser.parseType(sourceRawTypes[0])) + llvm::SmallVector sourceTypes(1); + if (parser.parseType(sourceTypes[0])) return failure(); if (parser.parseArrow()) return failure(); - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) + llvm::SmallVector TensorDescTypes(1); + if (parser.parseType(TensorDescTypes[0])) return failure(); result.addAttribute("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( @@ -352,11 +312,12 @@ ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { static_cast(shapeOperands.size()), static_cast(stridesOperands.size())})); - Type indexType = parser.getBuilder().getIndexType(); result.addTypes(TensorDescTypes); if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, result.operands)) return failure(); + + Type indexType = parser.getBuilder().getIndexType(); if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, result.operands)) return failure(); @@ -391,11 +352,13 @@ void CreateNdDescOp::print(OpAsmPrinter &printer) { printer << "]"; } - if (printDefaults || mode != Mode::SIMT) { - 
printer << ' ' << "{"; - printer << "mode = " << mode; - printer << "}"; - } + llvm::SmallVector elidedAttrs; + elidedAttrs.push_back("static_offsets"); + elidedAttrs.push_back("operandSegmentSizes"); + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -415,12 +378,12 @@ LogicalResult CreateNdDescOp::verify() { "non-scattered operators.\n"); } - if (mode == Mode::VC && mapping) { + if (mode == ModeKind::VC && mapping) { return emitOpError("Mapping attribute of TensorDesc is not expected " "for VC mode operations.\n"); } - if (mode == Mode::SIMT && !mapping) { + if (mode == ModeKind::SIMT && !mapping) { return emitOpError("Expecting SgMap attribute for SIMT mode operators.\n"); } @@ -494,8 +457,8 @@ llvm::SmallVector CreateNdDescOp::getShape() { return shape; } - emitOpError("The shape information is missing."); - llvm_unreachable("Unexpected error in CreateNdDescOp.\n"); + llvm_unreachable("Unexpected error in CreateNdDescOp. " + "The shape information is missing.\n"); } llvm::ArrayRef CreateNdDescOp::getStaticStrides() { @@ -541,199 +504,42 @@ llvm::ArrayRef CreateNdDescOp::getTensorDescShape() { return getTensorDescType().getShape(); } -ParseResult CreateDescOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand sourceRawOperands[1]; - llvm::ArrayRef sourceOperands( - sourceRawOperands); - llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(sourceRawOperands[0])) - return failure(); - - if (parser.parseComma()) - return failure(); - - OpAsmParser::UnresolvedOperand offsetsRawOperands[1]; - llvm::ArrayRef offsetsOperands( - offsetsRawOperands); - llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(offsetsRawOperands[0])) - return failure(); - - if (parseOptionalAttrDict(parser, result, {"chunk_size_per_lane", "mode"})) - return failure(); - - if (parser.parseColon()) - return failure(); - - Type sourceRawTypes[1]; - llvm::ArrayRef sourceTypes(sourceRawTypes); - if (parser.parseType(sourceRawTypes[0])) - return failure(); - if (parser.parseComma()) - return failure(); - - Type offsetsRawTypes[1]; - llvm::ArrayRef offsetsTypes(offsetsRawTypes); - if (parser.parseType(offsetsRawTypes[0])) - return failure(); - if (parser.parseArrow()) - return failure(); - - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) - return failure(); +//===----------------------------------------------------------------------===// +// XeGPU_LoadNDOp +//===----------------------------------------------------------------------===// - result.addTypes(TensorDescTypes); - if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, - result.operands)) - return failure(); - if (parser.resolveOperands(offsetsOperands, offsetsTypes, offsetsOperandsLoc, - result.operands)) +ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(1); + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(Operands[0])) return failure(); - return success(); -} - -void CreateDescOp::print(OpAsmPrinter &printer) { - auto mode = getMode(); - bool printSep = false; - auto chunk = getChunkSizePerLane(); - auto printDefaults = printDefaultValues(); - - printer << ' '; - printer << getSource(); - printer << ","; - printer << ' '; - 
printer << getOffsets(); - - if (printDefaults || mode != Mode::SIMT || chunk != 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << mode; - printSep = true; - } - - if (printDefaults || chunk != 1) { - if (printSep) - printer << "," << ' '; - printer << "chunk_size_per_lane = " << chunk; - } - - if (printDefaults || mode != Mode::SIMT || chunk != 1) { - printer << "}"; - } - - printer << ' ' << ":"; - printer << ' '; - printer << getSource().getType(); - printer << ","; - printer << ' '; - printer << getOffsets().getType(); - printer << ' ' << "->"; - printer << ' '; - printer << getTensorDesc().getType(); -} - -LogicalResult CreateDescOp::verify() { - auto mode = getMode(); - auto mapping = getTensorDesc().getType().getMapping(); - auto offsetTy = getOffsets().getType(); - auto tdescTy = getTensorDesc().getType(); - auto chunkSize = getChunkSizePerLane(); - - if (mode == Mode::SIMT || mapping) { - return emitOpError("CreateDescOp only support VC mode and mapping " - "attribute of TensorDesc is not expected.\n"); - } - - if (getRankOf(getSource()) > 2) - return emitOpError( - "Expecting the source is a 1D/2D memref or pointer (uint64_t)."); - - if (!tdescTy.getScattered()) - return emitOpError( - "Expecting the presence of ScatteredAttr for tensor descriptor."); - // Infer the TensorDesc shape - std::vector shape; - if (llvm::isa(offsetTy)) { - shape = llvm::dyn_cast(offsetTy).getShape().vec(); - if (shape.size() != 1) - return emitOpError("Expecting the offset is a 1D vector."); - } - - if (chunkSize != 1) { - shape.push_back(chunkSize); - } - - auto tdescShape = tdescTy.getShape(); - if (shape != tdescShape.vec()) { - return emitOpError("Expecting dimensions of offsets is the same as the " - "tensor descriptor, or one less than."); - } - - return success(); -} - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, Value offsets, - uint32_t chunk_size_per_lane) { - state.addOperands(source); - state.addOperands(offsets); - state.getOrAddProperties().chunk_size_per_lane = - builder.getIntegerAttr(builder.getIntegerType(32), chunk_size_per_lane); - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); - state.addTypes(TensorDesc); -} - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, Value offsets, - IntegerAttr chunk_size_per_lane) { - state.addOperands(source); - state.addOperands(offsets); - if (chunk_size_per_lane) - state.getOrAddProperties().chunk_size_per_lane = - chunk_size_per_lane; - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); - state.addTypes(TensorDesc); -} - -ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parseOptionalAttrDict( - parser, result, - {"mode", "vnni_axis", "transpose", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - Type 
TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) + llvm::SmallVector Types(1); + if (parser.parseType(Types[0])) return failure(); if (parser.parseArrow()) return failure(); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - if (parser.parseType(valueRawTypes[0])) + llvm::SmallVector valueTypes(1); + if (parser.parseType(valueTypes[0])) return failure(); result.addTypes(valueTypes); - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); return success(); @@ -741,42 +547,16 @@ ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { void LoadNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << mode; - printSep = true; - } - - if (getVnniAxisAttr()) { - if (printSep) - printer << "," << ' '; - printer << "vnni_axis = " << getVnniAxis().value(); - printSep = true; - } - - if (getTransposeAttr()) { - if (printSep) - printer << "," << ' '; - printer << "transpose = "; - getTransposeAttr().print(printer); - printSep = true; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -809,7 +589,7 @@ LogicalResult LoadNDOp::verify() { auto valueShape = valueTy.getShape().vec(); auto array_len = tdescTy.getArrayLength(); - if (mode == Mode::SIMT) { + if (mode == ModeKind::SIMT) { auto sgMap = tdescTy.getMapping(); if (!sgMap) { return emitOpError( @@ -864,50 +644,42 @@ LogicalResult LoadNDOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_StoreNDOp +//===----------------------------------------------------------------------===// ParseResult StoreNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand valueRawOperands[1]; - llvm::ArrayRef valueOperands( - valueRawOperands); - llvm::SMLoc valueOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(valueRawOperands[0])) + llvm::SmallVector Operands(2); + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + // parse value + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + // parse TensorDesc + if (parser.parseOperand(Operands[1])) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"}, true)) + // parse optional attributes + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parser.parseColon()) + if 
(failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - if (parser.parseType(valueRawTypes[0])) + if (parser.parseColon()) return failure(); - if (parser.parseComma()) + llvm::SmallVector Types; + if (parser.parseTypeList(Types)) return failure(); - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) - return failure(); - - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) - return failure(); - - if (parser.resolveOperands(valueOperands, valueTypes, valueOperandsLoc, - result.operands)) + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); return success(); @@ -915,9 +687,7 @@ ParseResult StoreNDOp::parse(OpAsmParser &parser, OperationState &result) { void StoreNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - [[maybe_unused]] bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getValue(); @@ -925,20 +695,10 @@ void StoreNDOp::print(OpAsmPrinter &printer) { printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, true); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -969,7 +729,7 @@ LogicalResult StoreNDOp::verify() { auto mode = getMode(); - if (mode == Mode::VC) { // for VC mode, no attr attached + if (mode == ModeKind::VC) { // for VC mode, no attr attached if (dstTy.getShape() != valTy.getShape()) return emitOpError("In VC mode, the value (vector) shape doesn't match " "the memory (dst) shape.\n"); @@ -1004,26 +764,32 @@ LogicalResult StoreNDOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchNDOp +//===----------------------------------------------------------------------===// ParseResult PrefetchNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector TensorDescTypes(1); llvm::SMLoc TensorDescOperandsLoc; - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(TensorDescOperands[0])) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return 
failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(TensorDescTypes[0])) return failure(); if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, TensorDescOperandsLoc, result.operands)) @@ -1033,255 +799,270 @@ ParseResult PrefetchNDOp::parse(OpAsmParser &parser, OperationState &result) { void PrefetchNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - [[maybe_unused]] bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, true); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; printer << getTensorDesc().getType(); } -LogicalResult DpasOp::verify() { +//===----------------------------------------------------------------------===// +// XeGPU_UpdateNDOffsetOp +//===----------------------------------------------------------------------===// +ParseResult UpdateNDOffsetOp::parse(OpAsmParser &parser, + OperationState &result) { + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector offsetsOperands; + llvm::SmallVector TensorDescTypes(1); + llvm::SmallVector resultTypes(1); + llvm::SMLoc TensorDescOperandsLoc; + llvm::SMLoc offsetsOperandsLoc; - int64_t lhsRank = getLhsType().getRank(); - int64_t rhsRank = getRhsType().getRank(); - Type lhsElemType = getLhsType().getElementType(); - Type rhsElemType = getRhsType().getElementType(); + TensorDescOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(TensorDescOperands[0])) + return failure(); + if (parser.parseComma()) + return failure(); - if (lhsElemType != rhsElemType) { - return emitOpError("lhs and rhs element type does not match for dpas op"); + // parse offsets, e.g., [x, y] + if (succeeded(parser.parseOptionalLSquare())) { + offsetsOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(offsetsOperands)) + return failure(); + if (parser.parseRSquare()) + return failure(); } - if (getAcc() && getAccType() != getResultType()) { - return emitOpError("Accumulator and Result for dpas op should have the " - "same type (both shape and element type)."); - } + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); - if (lhsRank != rhsRank || lhsRank != 3) { - return emitOpError( - "lhs and rhs rank does not match for dpas op, or their rank is not 3."); + auto loc = parser.getCurrentLocation(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseType(TensorDescTypes[0])) + return failure(); + if (parser.parseArrow()) + return failure(); + + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, + TensorDescOperandsLoc, result.operands)) + return failure(); + + Type indexType = parser.getBuilder().getIndexType(); + if 
(parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, + result.operands)) + return failure(); + return success(); +} + +void UpdateNDOffsetOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + if (!getOffsets().empty()) { + printer << ' ' << "["; + printer << getOffsets(); + printer << "]"; } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult UpdateNDOffsetOp::verify() { + // number of offsets specified must match the rank of the tensor descriptor + if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) { + return emitOpError("Invalid number of offsets."); + } return success(); } -ParseResult LoadGatherOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc; - OpAsmParser::UnresolvedOperand maskRawOperands[1]; - llvm::ArrayRef maskOperands(maskRawOperands); - llvm::SMLoc maskOperandsLoc; +//===----------------------------------------------------------------------===// +// XeGPU_CreateDescOp +//===----------------------------------------------------------------------===// +void CreateDescOp::build(OpBuilder &builder, OperationState &state, + TensorDescType TensorDesc, Value source, Value offsets, + uint32_t chunk_size_per_lane) { + state.addOperands(source); + state.addOperands(offsets); + state.getOrAddProperties().chunk_size_per_lane = + builder.getIntegerAttr(builder.getIntegerType(32), chunk_size_per_lane); + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); + state.addTypes(TensorDesc); +} - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - Type maskRawTypes[1]; - llvm::ArrayRef maskTypes(maskRawTypes); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); +void CreateDescOp::build(OpBuilder &builder, OperationState &state, + TensorDescType TensorDesc, Value source, Value offsets, + IntegerAttr chunk_size_per_lane) { + state.addOperands(source); + state.addOperands(offsets); + if (chunk_size_per_lane) + state.getOrAddProperties().chunk_size_per_lane = + chunk_size_per_lane; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); + state.addTypes(TensorDesc); +} - TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) +ParseResult CreateDescOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(2); + llvm::SmallVector Types(2); + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + // parse the source operand + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - maskOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(maskRawOperands[0])) + // parse the offset operand + if (parser.parseOperand(Operands[1])) + return failure(); + + // parse the optional attributes + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if 
(parseOptionalAttrDict( - parser, result, - {"mode", "vnni_axis", "transpose", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(Types[0])) return failure(); - if (parser.parseComma()) return failure(); - if (parser.parseType(maskRawTypes[0])) + if (parser.parseType(Types[1])) return failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(valueRawTypes[0])) - return failure(); - - result.addTypes(valueTypes); - - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + llvm::SmallVector TensorDescTypes(1); + if (parser.parseType(TensorDescTypes[0])) return failure(); - if (parser.resolveOperands(maskOperands, maskTypes, maskOperandsLoc, - result.operands)) + result.addTypes(TensorDescTypes); + if (parser.resolveOperands(Operands, Types, operandsLoc, result.operands)) return failure(); return success(); } -void LoadGatherOp::print(OpAsmPrinter &printer) { +void CreateDescOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; + auto chunk = getChunkSizePerLane(); auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; - printer << getTensorDesc(); + printer << getSource(); printer << ","; printer << ' '; - printer << getMask(); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - if (getVnniAxisAttr()) { - if (printSep) - printer << "," << ' '; - printer << "vnni_axis = " << getVnniAxis().value(); - printSep = true; - } - - if (getTransposeAttr()) { - if (printSep) - printer << "," << ' '; - printer << "transpose = "; - getTransposeAttr().print(printer); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); + printer << getOffsets(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; + llvm::SmallVector elidedAttrs; + if (!printDefaults) { + if (mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + if (chunk == 1) + elidedAttrs.push_back("chunk_size_per_lane"); } + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; - printer << getTensorDesc().getType(); + printer << getSource().getType(); printer << ","; printer << ' '; - printer << getMask().getType(); + printer << getOffsets().getType(); printer << ' ' << "->"; printer << ' '; - printer << getValue().getType(); + printer << getTensorDesc().getType(); } -LogicalResult LoadGatherOp::verify() { +LogicalResult CreateDescOp::verify() { + auto mode = getMode(); + auto mapping = getTensorDesc().getType().getMapping(); + auto offsetTy = getOffsets().getType(); auto tdescTy = getTensorDesc().getType(); - auto maskTy = getMask().getType(); - auto valueTy = getValue().getType(); - - if (!tdescTy.getScattered()) - return emitOpError( - "LoadGatherOp only works on TensorDesc with ScatteredAttr."); + auto chunkSize = getChunkSizePerLane(); - auto getElementType = [&](Type type) -> Type { - if (type.isIntOrIndexOrFloat()) - return type; - else if (llvm::isa(type)) - return llvm::dyn_cast(type).getElementType(); - else if (llvm::isa(type)) - return 
llvm::dyn_cast(type).getElementType(); - llvm_unreachable("Unsupported type."); - return type; - }; + if (mode == ModeKind::SIMT || mapping) { + return emitOpError("CreateDescOp only support VC mode and mapping " + "attribute of TensorDesc is not expected.\n"); + } - auto tdescElemTy = getElementType(tdescTy); - auto valueElemTy = getElementType(valueTy); - if (tdescElemTy != valueElemTy) + if (getRankOf(getSource()) > 2) return emitOpError( - "Value should have the same element type as TensorDesc."); - - auto getShape = [&](Type type) -> std::vector { - std::vector shape; - if (type.isIntOrIndexOrFloat()) - shape.push_back(1); - else if (llvm::isa(type)) - shape = llvm::dyn_cast(type).getShape().vec(); - else - llvm_unreachable("Unsupported type."); - return shape; - }; - - std::vector maskShape = getShape(maskTy); - std::vector valueShape = getShape(valueTy); - std::vector tdescShape = tdescTy.getShape().vec(); + "Expecting the source is a 1D/2D memref or pointer (uint64_t)."); - if (tdescShape != maskShape) - return emitOpError("Mask should have the same shape as TensorDesc."); + if (!tdescTy.getScattered()) + return emitOpError( + "Expecting the presence of ScatteredAttr for tensor descriptor."); - auto mode = getMode(); - auto mapping = tdescTy.getMapping(); - if (mode == Mode::SIMT || mapping) { - return emitOpError("LoadGatherOp only supports VC mode and mapping " - "attribute of TensorDesc is not expected.\n"); + // Infer the TensorDesc shape + std::vector shape; + if (llvm::isa(offsetTy)) { + shape = llvm::dyn_cast(offsetTy).getShape().vec(); + if (shape.size() != 1) + return emitOpError("Expecting the offset is a 1D vector."); } - if (getTranspose()) { - auto trans = getTranspose().value(); - if (tdescShape.size() >= trans.size()) - transpose(trans, tdescShape); - else - emitWarning("Invalid transpose attr. It is ignored."); + if (chunkSize != 1) { + shape.push_back(chunkSize); } - if (getVnniAxis()) { - auto axis = getVnniAxis().value(); - auto vnni_factor = valueShape.back(); - tdescShape[axis] /= vnni_factor; - tdescShape.push_back(vnni_factor); + auto tdescShape = tdescTy.getShape(); + if (shape != tdescShape.vec()) { + return emitOpError("Expecting dimensions of offsets is the same as the " + "tensor descriptor, or one less than."); } - if (valueShape != tdescShape) - return emitOpError( - "Result shape doesn't match TensorDesc shape. when VNNI is not enabled," - "the result should have the same shape (or transposed shape if " - "transpose is also enabled) as TensorDesc. When VNNI is enabled, " - "the result should have one more dimention than the TensorDesc, " - "with last dimention having vnni factor, but having same number of" - "total data elements. The vnni factor are typically calculated as " - "simd_lane_width/elementTypeBitWidth. 
For element type having " - "more than 32 bits, vnni shouldn't be used.\n"); - return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_LoadGatherOp +//===----------------------------------------------------------------------===// void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, Value TensorDesc, Value mask, IntegerAttr vnni_axis, - DenseI64ArrayAttr transpose, CacheReadHintAttr l1_hint, - CacheReadHintAttr l2_hint, CacheReadHintAttr l3_hint) { + DenseI64ArrayAttr transpose, CacheKindAttr l1_hint, + CacheKindAttr l2_hint, CacheKindAttr l3_hint) { state.addOperands(TensorDesc); state.addOperands(mask); if (vnni_axis) @@ -1300,14 +1081,14 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, state.getOrAddProperties().l3_hint = l3_hint; state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(value); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, Value TensorDesc, Value mask, IntegerAttr vnni_axis, - DenseI64ArrayAttr transpose, CacheReadHint l1_hint, - CacheReadHint l2_hint, CacheReadHint l3_hint) { + DenseI64ArrayAttr transpose, CacheKind l1_hint, + CacheKind l2_hint, CacheKind l3_hint) { state.addOperands(TensorDesc); state.addOperands(mask); if (vnni_axis) @@ -1317,99 +1098,244 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, state.getOrAddProperties().transpose = transpose; state.getOrAddProperties().l1_hint = - CacheReadHintAttr::get(builder.getContext(), l1_hint); + CacheKindAttr::get(builder.getContext(), l1_hint); state.getOrAddProperties().l2_hint = - CacheReadHintAttr::get(builder.getContext(), l2_hint); + CacheKindAttr::get(builder.getContext(), l2_hint); state.getOrAddProperties().l3_hint = - CacheReadHintAttr::get(builder.getContext(), l3_hint); + CacheKindAttr::get(builder.getContext(), l3_hint); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(value); } -ParseResult StoreScatterOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc; - - OpAsmParser::UnresolvedOperand valueRawOperands[1]; - llvm::ArrayRef valueOperands( - valueRawOperands); - llvm::SMLoc valueOperandsLoc; - - OpAsmParser::UnresolvedOperand maskRawOperands[1]; - llvm::ArrayRef maskOperands(maskRawOperands); - llvm::SMLoc maskOperandsLoc; - - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - - Type maskRawTypes[1]; - llvm::ArrayRef maskTypes(maskRawTypes); +ParseResult LoadGatherOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(2); + llvm::SmallVector Types(2); + llvm::SmallVector valueTypes(1); + llvm::SMLoc OperandsLoc; - valueOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(valueRawOperands[0])) + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(Operands[1])) return failure(); - if 
(parser.parseComma()) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - maskOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(maskRawOperands[0])) + if (parser.parseColon()) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"}, true)) + if (parser.parseType(Types[0])) return failure(); - if (parser.parseColon()) + if (parser.parseComma()) return failure(); - if (parser.parseType(valueRawTypes[0])) + if (parser.parseType(Types[1])) return failure(); - if (parser.parseComma()) + if (parser.parseArrow()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(valueTypes[0])) return failure(); - if (parser.parseComma()) + result.addTypes(valueTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); - if (parser.parseType(maskRawTypes[0])) + return success(); +} + +void LoadGatherOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getMask(); + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ","; + printer << ' '; + printer << getMask().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getValue().getType(); +} + +LogicalResult LoadGatherOp::verify() { + auto tdescTy = getTensorDesc().getType(); + auto maskTy = getMask().getType(); + auto valueTy = getValue().getType(); + + if (!tdescTy.getScattered()) + return emitOpError( + "LoadGatherOp only works on TensorDesc with ScatteredAttr."); + + auto getElementType = [&](Type type) -> Type { + if (type.isIntOrIndexOrFloat()) + return type; + else if (llvm::isa(type)) + return llvm::dyn_cast(type).getElementType(); + else if (llvm::isa(type)) + return llvm::dyn_cast(type).getElementType(); + llvm_unreachable("Unsupported type."); + return type; + }; + + auto tdescElemTy = getElementType(tdescTy); + auto valueElemTy = getElementType(valueTy); + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto getShape = [&](Type type) -> std::vector { + std::vector shape; + if (type.isIntOrIndexOrFloat()) + shape.push_back(1); + else if (llvm::isa(type)) + shape = llvm::dyn_cast(type).getShape().vec(); + else + llvm_unreachable("Unsupported type."); + return shape; + }; + + std::vector maskShape = getShape(maskTy); + std::vector valueShape = getShape(valueTy); + std::vector tdescShape = tdescTy.getShape().vec(); + + if (tdescShape != maskShape) + return emitOpError("Mask should have the same shape as TensorDesc."); + + auto mode = getMode(); + auto mapping = tdescTy.getMapping(); + if (mode == ModeKind::SIMT || mapping) { + return emitOpError("LoadGatherOp only supports VC mode and mapping " + "attribute of TensorDesc is not expected.\n"); + } + + if (getTransposeAttr()) { + auto trans = getTranspose().value(); + if (tdescShape.size() < trans.size()) + return emitWarning("Invalid transpose attr. 
It is ignored."); + transpose(trans, tdescShape); + } + + if (getVnniAxis()) { + auto axis = getVnniAxis().value(); + auto vnni_factor = valueShape.back(); + tdescShape[axis] /= vnni_factor; + tdescShape.push_back(vnni_factor); + } + + if (valueShape != tdescShape) + return emitOpError( + "Result shape doesn't match TensorDesc shape. when VNNI is not enabled," + "the result should have the same shape (or transposed shape if " + "transpose is also enabled) as TensorDesc. When VNNI is enabled, " + "the result should have one more dimention than the TensorDesc, " + "with last dimention having vnni factor, but having same number of" + "total data elements. The vnni factor are typically calculated as " + "simd_lane_width/elementTypeBitWidth. For element type having " + "more than 32 bits, vnni shouldn't be used.\n"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreScatterOp +//===----------------------------------------------------------------------===// +void StoreScatterOp::build(OpBuilder &builder, OperationState &state, + Value value, Value TensorDesc, Value mask, + CacheKindAttr l1_hint, CacheKindAttr l2_hint, + CacheKindAttr l3_hint) { + state.addOperands(value); + state.addOperands(TensorDesc); + state.addOperands(mask); + if (l1_hint) + state.getOrAddProperties().l1_hint = l1_hint; + if (l2_hint) + state.getOrAddProperties().l2_hint = l2_hint; + if (l3_hint) + state.getOrAddProperties().l3_hint = l3_hint; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); +} + +void StoreScatterOp::build(OpBuilder &builder, OperationState &state, + Value value, Value TensorDesc, Value mask, + CacheKind l1_hint, CacheKind l2_hint, + CacheKind l3_hint) { + state.addOperands(value); + state.addOperands(TensorDesc); + state.addOperands(mask); + state.getOrAddProperties().l1_hint = + CacheKindAttr::get(builder.getContext(), l1_hint); + state.getOrAddProperties().l2_hint = + CacheKindAttr::get(builder.getContext(), l2_hint); + ; + state.getOrAddProperties().l3_hint = + CacheKindAttr::get(builder.getContext(), l3_hint); + ; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); +} + +ParseResult StoreScatterOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) return failure(); - if (parser.resolveOperands(valueOperands, valueTypes, valueOperandsLoc, - result.operands)) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + if (parser.parseColon()) return failure(); - if (parser.resolveOperands(maskOperands, maskTypes, maskOperandsLoc, - result.operands)) + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); + return success(); } void StoreScatterOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer 
<< getValue(); @@ -1420,20 +1346,10 @@ void StoreScatterOp::print(OpAsmPrinter &printer) { printer << ' '; printer << getMask(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -1453,7 +1369,7 @@ LogicalResult StoreScatterOp::verify() { auto mode = getMode(); auto mapping = tdescTy.getMapping(); - if (mode != Mode::VC || mapping) + if (mode != ModeKind::VC || mapping) return emitOpError("StoreScatterOp only supports VC mode and mapping " "attribute of TensorDesc is not expected.\n"); @@ -1490,67 +1406,64 @@ LogicalResult StoreScatterOp::verify() { return success(); } -void StoreScatterOp::build(OpBuilder &builder, OperationState &state, - Value value, Value TensorDesc, Value mask, - CacheWriteHintAttr l1_hint, - CacheWriteHintAttr l2_hint, - CacheWriteHintAttr l3_hint) { - state.addOperands(value); +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchOp +//===----------------------------------------------------------------------===// +void PrefetchOp::build(OpBuilder &builder, OperationState &state, + Value TensorDesc, CacheKindAttr l1_hint, + CacheKindAttr l2_hint, CacheKindAttr l3_hint) { state.addOperands(TensorDesc); - state.addOperands(mask); - if (l1_hint) { + if (l1_hint) state.getOrAddProperties().l1_hint = l1_hint; - } - if (l2_hint) { + + if (l2_hint) state.getOrAddProperties().l2_hint = l2_hint; - } - if (l3_hint) { + + if (l3_hint) state.getOrAddProperties().l3_hint = l3_hint; - } + state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); } -void StoreScatterOp::build(OpBuilder &builder, OperationState &state, - Value value, Value TensorDesc, Value mask, - CacheWriteHint l1_hint, CacheWriteHint l2_hint, - CacheWriteHint l3_hint) { - state.addOperands(value); +void PrefetchOp::build(OpBuilder &builder, OperationState &state, + Value TensorDesc, CacheKind l1_hint, CacheKind l2_hint, + CacheKind l3_hint) { state.addOperands(TensorDesc); - state.addOperands(mask); state.getOrAddProperties().l1_hint = - CacheWriteHintAttr::get(builder.getContext(), l1_hint); + CacheKindAttr::get(builder.getContext(), l1_hint); state.getOrAddProperties().l2_hint = - CacheWriteHintAttr::get(builder.getContext(), l2_hint); - ; + CacheKindAttr::get(builder.getContext(), l2_hint); state.getOrAddProperties().l3_hint = - CacheWriteHintAttr::get(builder.getContext(), l3_hint); - ; + CacheKindAttr::get(builder.getContext(), l3_hint); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); } ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector TensorDescTypes(1); llvm::SMLoc TensorDescOperandsLoc; - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); 
TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(TensorDescOperands[0])) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"})) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(TensorDescTypes[0])) return failure(); + if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, TensorDescOperandsLoc, result.operands)) return failure(); @@ -1559,27 +1472,15 @@ ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) { void PrefetchOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -1591,89 +1492,235 @@ LogicalResult PrefetchOp::verify() { auto tdescTy = getTensorDesc().getType(); auto mapping = tdescTy.getMapping(); - if (tdescTy.getScattered()) + auto isValidHint = [&](CacheKindAttr attr) -> bool { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CacheKind::CACHED || kind == CacheKind::UNCACHED || + kind == CacheKind::STREAMING || kind == CacheKind::READ_INVALIDATE; + }; + + if (!isValidHint(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isValidHint(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isValidHint(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + if (!tdescTy.getScattered()) return emitOpError("Invalid TensorDesc. PrefetchOp only works on " "TensorDescs with ScatteredAttr."); - if (mode != Mode::VC || mapping) { - return emitOpError("PrefetchOp only supports VC mode. 
and mapping " + if (mode != ModeKind::VC || mapping) { + return emitOpError("PrefetchOp only supports VC mode, and mapping " "attribute of TensorDesc is not expected.\n"); } return success(); } -void PrefetchOp::build(OpBuilder &builder, OperationState &state, - Value TensorDesc, CacheReadHintAttr l1_hint, - CacheReadHintAttr l2_hint, CacheReadHintAttr l3_hint) { +//===----------------------------------------------------------------------===// +// XeGPU_UpdateOffsetOp +//===----------------------------------------------------------------------===// +void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state, + Type result, Value TensorDesc, Value offsets) { state.addOperands(TensorDesc); - if (l1_hint) - state.getOrAddProperties().l1_hint = l1_hint; + state.addOperands(offsets); + state.getOrAddProperties().mode = + xegpu::ModeKindAttr::get(builder.getContext(), xegpu::ModeKind::VC); + state.addTypes(result); +} - if (l2_hint) - state.getOrAddProperties().l2_hint = l2_hint; +ParseResult UpdateOffsetOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; - if (l3_hint) - state.getOrAddProperties().l3_hint = l3_hint; + auto OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); } -void PrefetchOp::build(OpBuilder &builder, OperationState &state, - Value TensorDesc, CacheReadHint l1_hint, - CacheReadHint l2_hint, CacheReadHint l3_hint) { - state.addOperands(TensorDesc); - state.getOrAddProperties().l1_hint = - CacheReadHintAttr::get(builder.getContext(), l1_hint); - state.getOrAddProperties().l2_hint = - CacheReadHintAttr::get(builder.getContext(), l2_hint); - state.getOrAddProperties().l3_hint = - CacheReadHintAttr::get(builder.getContext(), l3_hint); - ; - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); +void UpdateOffsetOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getOffsets(); + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ","; + printer << ' '; + printer << getOffsets().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); } LogicalResult UpdateOffsetOp::verify() { + auto mode = getMode(); + if (mode != ModeKind::VC) + return emitOpError("UpdateOffsetOp only work on VC mode.\n"); + auto srcTy = getTensorDesc().getType(); - auto offTy 
= getOffsets().getType(); auto resTy = getResult().getType(); - if (srcTy != resTy) - return emitOpError( - "The result should have the same type" - "(shape and encoding attribute) as the input TensorDesc."); - - auto shape = srcTy.getShape(); + return emitOpError("The result should have the same type (shape and " + "encoding attribute) as the input TensorDesc."); if (!srcTy.getScattered()) { return emitOpError("Invalid TensorDesc. UpdateOffsetOp only works on " "TensorDescs with ScatteredAttr."); } - auto vecTy = llvm::dyn_cast(offTy); - if (!vecTy || vecTy.getRank() != 1) + auto offTy = llvm::dyn_cast(getOffsets().getType()); + if (!offTy || offTy.getRank() != 1) return emitOpError("The offset should be an 1D vector.\n"); - if (shape[0] != vecTy.getShape()[0]) + auto shape = srcTy.getShape(); + if (shape[0] != offTy.getShape()[0]) return emitOpError( "The offset should have same length as the dim-0 of TensorDesc."); return success(); } -LogicalResult UpdateNDOffsetOp::verify() { - // number of offsets specified must match the rank of the tensor descriptor - if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) { - return emitOpError("Invalid number of offsets."); +//===----------------------------------------------------------------------===// +// XeGPU_DpasOp +//===----------------------------------------------------------------------===// +ParseResult DpasOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + + return success(); +} + +void DpasOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getLhs(); + printer << ","; + printer << ' '; + printer << getRhs(); + if (Value value = getAcc()) + printer << ", " << value; + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getLhs().getType(); + printer << ","; + printer << ' '; + printer << getRhs().getType(); + if (getAcc()) { + printer << ","; + printer << ' '; + printer << llvm::ArrayRef(getAcc().getType()); } + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult DpasOp::verify() { + int64_t lhsRank = getLhsType().getRank(); + int64_t rhsRank = getRhsType().getRank(); + Type lhsElemType = getLhsType().getElementType(); + Type rhsElemType = getRhsType().getElementType(); + + if (lhsElemType != rhsElemType) + return emitOpError("lhs and rhs element type does not match for dpas op"); + + if (getAcc() && getAccType() != getResultType()) + return 
emitOpError("Accumulator and Result for dpas op should have the " + "same type (both shape and element type)."); + + if (lhsRank != rhsRank || lhsRank != 3) + return emitOpError( + "lhs and rhs rank does not match for dpas op, or their rank is not 3."); + return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_InvokeSIMDOp +//===----------------------------------------------------------------------===// void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, SymbolRefAttr callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { state.addOperands(operands); state.addAttribute("argType", argType); state.addAttribute("callee", callee); @@ -1682,25 +1729,20 @@ void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, StringAttr callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { build(builder, state, SymbolRefAttr::get(callee), results, argType, operands); } void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, llvm::StringRef callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { build(builder, state, StringAttr::get(builder.getContext(), callee), results, argType, operands); } -LogicalResult AtomicRMWOp::verify() { - auto mode = getMode(); - if (mode != Mode::VC) { - return emitOpError("AtomicRMWOp only work on VC mode.\n"); - } - return success(); -} - +//===----------------------------------------------------------------------===// +// XeGPU_AtomicRMWOp +//===----------------------------------------------------------------------===// void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, AtomicRMWKindAttr kind, Value tensorDesc, Value mask, Value value) { @@ -1710,7 +1752,7 @@ void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, state.addOperands(value); state.getOrAddProperties().kind = kind; state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(result); } @@ -1724,10 +1766,161 @@ void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, state.getOrAddProperties().kind = AtomicRMWKindAttr::get(builder.getContext(), kind); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(result); } +ParseResult AtomicRMWOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + llvm::SmallVector resultTypes(1); + + xegpu::AtomicRMWKindAttr kindAttr; + if (parser.parseCustomAttributeWithFallback(kindAttr, Type{})) + return failure(); + if (kindAttr) + result.getOrAddProperties().kind = kindAttr; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + 
return failure(); + + if (parser.parseArrow()) + return failure(); + + if (parser.parseCustomTypeWithFallback(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); +} + +void AtomicRMWOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer.printStrippedAttrOrType(getKindAttr()); + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getMask(); + if (Value value = getValue()) + printer << ", " << value; + + llvm::SmallVector elidedAttrs; + elidedAttrs.push_back("kind"); + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getOperation()->getOperandTypes(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult AtomicRMWOp::verify() { + auto mode = getMode(); + if (mode != ModeKind::VC) + return emitOpError("AtomicRMWOp only work on VC mode.\n"); + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNbarrierOp +//===----------------------------------------------------------------------===// +ParseResult CreateNbarrierOp::parse(OpAsmParser &parser, + OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseLParen()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseRParen()) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + + result.addTypes(resultTypes); + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); +} + +void CreateNbarrierOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer << ' '; + printer << getNbarrierId(); + printer << ","; + printer << ' '; + printer << getNbarrierRole(); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' ' << "("; + printer << getNbarrierId().getType(); + printer << ","; + printer << ' '; + printer << getNbarrierRole().getType(); + printer << ")"; + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir index 24aa836f80d44..64a6f547fbd29 100644 --- a/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir @@ -9,12 +9,12 @@ func.func @test_create_nd_tdesc_vc(%src: 
memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.create_nd_tdesc + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %src[2, 4] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> @@ -24,8 +24,7 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc {{.*}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> @@ -34,14 +33,12 @@ func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_load_nd_vc({{.*}}) { func.func @test_load_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg1, %arg2] + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 0 : i64} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return @@ -52,71 +49,62 @@ func.func @test_store_nd_vc(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %3 = xegpu.load_nd %1 {mode=vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd {{%[0-9], %[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> xegpu.store_nd 
%3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> return } - // CHECK-LABEL: func @test_dpas_vc({{.*}}) { func.func @test_dpas_vc(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { - // CHECK: xegpu.dpas + // CHECK: xegpu.dpas {{.*}} {mode = #xegpu} // CHECK-SAME: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> %1 = xegpu.dpas %a, %b {mode = vc}: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } - // CHECK-LABEL: func @test_update_nd_offset_vc({{.*}}) { func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{%[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.update_nd_offset + // CHECK: xegpu.update_nd_offset {{%[0-9]}}, [{{%c[0-9], %c[0-9]}}] {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.update_nd_offset %1, [%c0, %c1]: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc}: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> return } // CHECK-LABEL: func @test_prefetch_nd_vc({{.*}}) { func.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd {{%[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> return } diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir index 25e7de33c6c12..f80df161a543a 100644 --- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir +++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir @@ -10,7 +10,7 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : v // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> - xegpu.atomic_rmw "addf" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw #xegpu %1, %mask, %value {mode=vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32> return @@ -23,7 +23,7 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> - xegpu.atomic_rmw "mulf" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw mulf %1, %mask, 
%value {mode=vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> return @@ -36,7 +36,7 @@ func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> - xegpu.atomic_rmw "andi" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw andi %1, %mask, %value {mode=vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> return diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir index 096451df04564..0f7229a02aa18 100644 --- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir @@ -10,7 +10,7 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : v // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> - xegpu.atomic_rmw "addf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32> + xegpu.atomic_rmw addf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32> return } @@ -21,7 +21,7 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> - xegpu.atomic_rmw "mulf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> + xegpu.atomic_rmw mulf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> return } @@ -32,7 +32,7 @@ func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> - xegpu.atomic_rmw "andi" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> + xegpu.atomic_rmw andi %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> return } diff --git a/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir index 9cbc8b4f7d94b..245d862e302a7 100644 --- a/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir @@ -7,8 +7,7 @@ // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> return @@ -16,8 +15,7 @@ func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) { func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc} + // CHECK: 
xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> @@ -26,8 +24,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) { func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> @@ -36,8 +33,7 @@ func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) { func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> @@ -47,8 +43,7 @@ func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) { func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> diff --git a/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir index 2689d401dc316..a3cb890483e63 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir @@ -8,13 +8,11 @@ // CHECK-LABEL: func @test_load_gather_vc({{.*}}) { func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> @@ -24,31 +22,27 @@ 
func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { // CHECK-LABEL: func @test_load_gather_vc_2({{.*}}) { func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16x8xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, transpose = array} // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> %2 = xegpu.load %1, %0 {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> return } -// CHECK-LABEL: func @test_load_gather_vc_4({{.*}}) { -func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) { +// CHECK-LABEL: func @test_load_gather_vc_3({{.*}}) { +func.func @test_load_gather_vc_3(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 1} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir index d05a0b523c51d..0644565c3f002 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_nd.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir @@ -13,50 +13,38 @@ func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> -> vector<4x1x2xf16> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc 
%{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> -> vector<8x1x2xf16> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> -> vector<8x1xf32> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> %7 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> -> vector<4x1x2xf16> %8 = xegpu.load_nd %7 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> -> vector<4x1x2xf16> return @@ -70,39 +58,27 @@ func.func @test_load_nd_bf16(%A: memref<24x32xbf16>, %B : memref<24x32xbf16>, %C %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> - %1 = xegpu.create_nd_tdesc %A[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> + %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xbf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> -> vector<4x1x2xbf16> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> -> vector<4x1x2xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> - %3 = xegpu.create_nd_tdesc %B[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> + %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> - // CHECK: xegpu.load_nd - 
// CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x2xbf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> -> vector<8x1x2xbf16> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> -> vector<8x1x2xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - %5 = xegpu.create_nd_tdesc %C[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_bf16_c> -> vector<8x1xf32> return @@ -116,39 +92,28 @@ func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : mem %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> - %1 = xegpu.create_nd_tdesc %A[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> + %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x4xi8> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> -> vector<4x1x4xi8> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x1x4xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> - %3 = xegpu.create_nd_tdesc %B[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> + %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x4xi8> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> -> vector<8x1x4xi8> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x1x4xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> - %5 = xegpu.create_nd_tdesc %C[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> + %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> - // CHECK-SAME: -> 
vector<8x1xi32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> -> vector<8x1xi32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> -> vector<8x1xi32> return diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir index 8703f171ac9df..78980b551c067 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir @@ -10,36 +10,31 @@ func.func @test_load_nd_simd_f32(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {mode = #xegpu} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME:{mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint = streaming} - // CHECK-SAME:!xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, l3_hint = #xegpu, mode = #xegpu, transpose = array} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> %3 = xegpu.load_nd %1 {mode= vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming} : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> return } // CHECK-LABEL: func @test_load_nd_simd_f16({{.*}}) { func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg1, %arg2] - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} - : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]+}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return } @@ -47,13 +42,11 @@ func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index // CHECK-LABEL: func @test_load_nd_simd_bf16({{.*}}) { func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: {mode = vc} - // CHECK-SAME: ui64 -> 
!xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : ui64 -> !xegpu.tensor_desc<8x16xbf16> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : ui64 -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 1, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 1 : i64} // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16> %2 = xegpu.load_nd %1 {mode=vc, vnni_axis = 1, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16> @@ -62,14 +55,13 @@ func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index // CHECK-LABEL: func @test_load_nd_block_array_simd_f16({{.*}}) { func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[0, 0] {mode = #xegpu} // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc} : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir index aec7689e92e70..6e2cb4de4ce1d 100644 --- a/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir @@ -8,9 +8,11 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} + // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu} : !xegpu.tensor_desc<8x16xf32> xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32> return @@ -18,12 +20,14 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_1({{.*}}) { func.func @test_prefetch_nd_tdesc_vc_1(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} - : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + + // CHECK: xegpu.prefetch_nd %{{[0-9]}} + // 
CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> return } @@ -34,9 +38,11 @@ func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} + // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xi8> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu} : !xegpu.tensor_desc<8x16xi8> xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xi8> return @@ -44,12 +50,13 @@ func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) { // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_bf16({{.*}}) { func.func @test_prefetch_nd_tdesc_vc_bf16(%src: memref<24x32xbf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.prefetch_nd %0 {mode = vc, l1_hint = uncached, l2_hint = cached} : !xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = uncached, l2_hint = cached}: !xegpu.tensor_desc<8x16xbf16> return } diff --git a/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir index 695c189627e1a..170b3a9fe8147 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir @@ -9,25 +9,21 @@ func.func @test_store_nd_vc_bf16(%src: memref<24x32xbf16>, %dst: memref<24x32xbf %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xbf16> -> 
vector<8x16xbf16> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16> return @@ -38,25 +34,22 @@ func.func @test_store_nd_vc_f64(%src: memref<24x32xf64>, %dst: memref<24x32xf64> %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} - : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> + %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64> return @@ -67,25 +60,23 @@ func.func @test_store_nd_vc_i8(%src: memref<24x32xi8>, %dst: memref<24x32xi8>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // 
CHECK-SAME: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8> return diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir index 4bc631acc5125..6d98ac3950c31 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir @@ -7,25 +7,21 @@ // CHECK-LABEL: func @test_store_scatter({{.*}}) { func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui64) { %0 = arith.constant dense: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets {mode = vc} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.store - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir index d1e57ddda45e2..c1a51712e7003 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir @@ -7,26 +7,22 @@ // CHECK-LABEL: func @test_store_scatter_vc({{.*}}) { func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = 
#xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.store - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} - // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> + // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> return diff --git a/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir index e25edcdd72b2a..1b97be77a2d79 100644 --- a/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir +++ b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir @@ -8,22 +8,20 @@ func.func @test_update_nd_offset_vc_0(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.update_nd_offset + // CHECK: xegpu.update_nd_offset %{{[0-9]}}, [%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc} - : !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> return } diff --git a/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 0852484423693..05b0092d2379b 100644 --- a/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -7,14 +7,12 @@ // CHECK-LABEL: func @test_update_offset_VC({{.*}}) { func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, 
%0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> @@ -22,7 +20,7 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %3 = arith.constant dense<16>: vector<16 x index> %4 = arith.addi %offsets, %3: vector<16 x index> - // CHECK: xegpu.update_offset + // CHECK: xegpu.update_offset %{{[0-9]}}, %{{[0-9]}} {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %5 = xegpu.update_offset %1, %4 {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
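
Note (not part of the patch): the custom parsers and printers above, as well as the updated tests, rely on the xegpu::ModeKind enum (SIMT/VC) introduced by this change. For reference, below is a minimal C++ sketch of how the generated enum helpers could be used by client code, assuming the standard stringify/symbolize functions that MLIR's EnumsGen emits for an I32EnumAttr; the header path and helper names are assumptions for illustration, not taken from this patch.

#include "mlir/Dialect/XeGPU/IR/XeGPU.h" // assumed umbrella header for the dialect
#include "llvm/ADT/StringRef.h"
#include <optional>

// Map the textual mnemonic used in the assembly (e.g. "vc") back to the enum.
// symbolizeModeKind is the conventional helper generated for an I32EnumAttr.
static bool isVCMode(llvm::StringRef spelling) {
  std::optional<mlir::xegpu::ModeKind> kind =
      mlir::xegpu::symbolizeModeKind(spelling);
  return kind && *kind == mlir::xegpu::ModeKind::VC;
}

// Map the enum back to its assembly mnemonic, e.g. when building diagnostics.
static llvm::StringRef modeMnemonic(mlir::xegpu::ModeKind kind) {
  return mlir::xegpu::stringifyModeKind(kind);
}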