diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 84112b8b18a81..ed3d9bbc77256 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -10,12 +10,11 @@ #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" - include "mlir/IR/EnumAttr.td" class XeGPUAttr traits = [], string baseCppClass = "::mlir::Attribute"> - : AttrDef { + : AttrDef { let mnemonic = attrMnemonic; } @@ -49,7 +48,7 @@ def XeGPU_SgMapAttr: XeGPUAttr<"SubGroupMap", "sg_map"> { def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let parameters = (ins - DefaultValuedParameter<"xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, + DefaultValuedParameter<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">: $memory_scope, DefaultValuedParameter<"int", "1">: $array_length, DefaultValuedParameter<"bool", "true">: $boundary_check, OptionalParameter<"xegpu::ScatteredAttr">: $scattered, @@ -58,7 +57,7 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let builders = [ AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">:$memory_scope, + CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">:$memory_scope, CArg<"int", "1">:$array_length, CArg<"xegpu::ScatteredAttr", "{}">:$scattered, CArg<"xegpu::SubGroupMapAttr", "{}">:$map @@ -72,65 +71,80 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { let hasCustomAssemblyFormat = true; } -def XeGPU_ArgTypeAttr : I32EnumAttr< - "ArgType", "", [ I32EnumAttrCase<"Vector", 0, "vector">, - I32EnumAttrCase<"Scalar", 1, "scalar"> ]> { - let cppNamespace = "::mlir::xegpu"; +def ARG_TYPE_VECTOR : I32EnumAttrCase<"VECTOR", 0, "vector">; +def ARG_TYPE_SCALAR : I32EnumAttrCase<"SCALAR", 1, "scalar">; +def XeGPU_ArgTypeKind : I32EnumAttr<"ArgTypeKind", + "Argument type for Invoke_SIMD op", + [ARG_TYPE_VECTOR, ARG_TYPE_SCALAR]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_ModeAttr : I32EnumAttr< - "Mode", "", [ I32EnumAttrCase<"SIMT", 0, "simt">, - I32EnumAttrCase<"VC", 1, "vc"> ]> { - let cppNamespace = "::mlir::xegpu"; +def MODE_SIMT : I32EnumAttrCase<"SIMT", 0, "simt">; +def MODE_VC : I32EnumAttrCase<"VC", 1, "vc">; +def XeGPU_ModeKind : I32EnumAttr<"ModeKind", + "The Mode an operator runs on", + [MODE_SIMT, MODE_VC]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_MemoryScopeAttr : I32EnumAttr< - "MemoryScope", "", [ I32EnumAttrCase<"GLOBAL", 0, "global">, - I32EnumAttrCase<"SLM", 1, "slm"> ]> { - let cppNamespace = "::mlir::xegpu"; +def MEMORY_SCOPE_GLOBAL: I32EnumAttrCase<"GLOBAL", 0, "global">; +def MEMORY_SCOPE_SHARED: I32EnumAttrCase<"SLM", 1, "slm">; +def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind", + "The scope of the memory the tensor descritor is created for", + [MEMORY_SCOPE_GLOBAL, MEMORY_SCOPE_SHARED]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } -def XeGPU_CacheReadAttr : I32EnumAttr< - "CacheReadHint", "", [ I32EnumAttrCase<"UNCACHED", 0, "uncached">, - I32EnumAttrCase<"CACHED", 1, "cached">, - I32EnumAttrCase<"STREAMING", 2, "streaming">, - I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate"> ]> { +def CACHE_KIND_CACHED: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write +def CACHE_KIND_UNCACHED: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write +def 
CACHE_KIND_STREAMING: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only +def CACHE_KIND_INVALIDATE: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only +def CACHE_KIND_WRITE_BACK: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only +def CACHE_KIND_WRITE_THROUGH: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only - let cppNamespace = "::mlir::xegpu"; -} -def XeGPU_CacheWriteAttr : I32EnumAttr< - "CacheWriteHint", "", [ I32EnumAttrCase<"UNCACHED", 0, "uncached">, - I32EnumAttrCase<"WRITE_THROUGH", 1, "write_through">, - I32EnumAttrCase<"WRITE_BACK", 2, "write_back">, - I32EnumAttrCase<"STREAMING", 3, "streaming"> ]> { - let cppNamespace = "::mlir::xegpu"; +def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind", + [CACHE_KIND_CACHED, CACHE_KIND_UNCACHED, + CACHE_KIND_STREAMING, CACHE_KIND_INVALIDATE, + CACHE_KIND_WRITE_BACK, CACHE_KIND_WRITE_THROUGH]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; } +def XeGPU_ArgTypeAttr : EnumAttr; +def XeGPU_ModeAttr : EnumAttr; +def XeGPU_MemoryScopeAttr : EnumAttr; +def XeGPU_CacheAttr : EnumAttr; + // RMW kind attribute -def ATOMIC_RMW_KIND_ADDF : I64EnumAttrCase<"addf", 0>; -def ATOMIC_RMW_KIND_ADDI : I64EnumAttrCase<"addi", 1>; -def ATOMIC_RMW_KIND_ASSIGN : I64EnumAttrCase<"assign", 2>; -def ATOMIC_RMW_KIND_MAXF : I64EnumAttrCase<"maxf", 3>; -def ATOMIC_RMW_KIND_MAXS : I64EnumAttrCase<"maxs", 4>; -def ATOMIC_RMW_KIND_MAXU : I64EnumAttrCase<"maxu", 5>; -def ATOMIC_RMW_KIND_MINF : I64EnumAttrCase<"minf", 6>; -def ATOMIC_RMW_KIND_MINS : I64EnumAttrCase<"mins", 7>; -def ATOMIC_RMW_KIND_MINU : I64EnumAttrCase<"minu", 8>; -def ATOMIC_RMW_KIND_MULF : I64EnumAttrCase<"mulf", 9>; -def ATOMIC_RMW_KIND_MULI : I64EnumAttrCase<"muli", 10>; -def ATOMIC_RMW_KIND_ORI : I64EnumAttrCase<"ori", 11>; -def ATOMIC_RMW_KIND_ANDI : I64EnumAttrCase<"andi", 12>; - -def XeGPU_AtomicRMWKindAttr : I64EnumAttr< - "AtomicRMWKind", "", - [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN, - ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU, - ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU, - ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI, - ATOMIC_RMW_KIND_ANDI]> { +def ATOMIC_RMW_KIND_ADDF : I32EnumAttrCase<"addf", 0>; +def ATOMIC_RMW_KIND_ADDI : I32EnumAttrCase<"addi", 1>; +def ATOMIC_RMW_KIND_ASSIGN : I32EnumAttrCase<"assign", 2>; +def ATOMIC_RMW_KIND_MAXF : I32EnumAttrCase<"maxf", 3>; +def ATOMIC_RMW_KIND_MAXS : I32EnumAttrCase<"maxs", 4>; +def ATOMIC_RMW_KIND_MAXU : I32EnumAttrCase<"maxu", 5>; +def ATOMIC_RMW_KIND_MINF : I32EnumAttrCase<"minf", 6>; +def ATOMIC_RMW_KIND_MINS : I32EnumAttrCase<"mins", 7>; +def ATOMIC_RMW_KIND_MINU : I32EnumAttrCase<"minu", 8>; +def ATOMIC_RMW_KIND_MULF : I32EnumAttrCase<"mulf", 9>; +def ATOMIC_RMW_KIND_MULI : I32EnumAttrCase<"muli", 10>; +def ATOMIC_RMW_KIND_ORI : I32EnumAttrCase<"ori", 11>; +def ATOMIC_RMW_KIND_ANDI : I32EnumAttrCase<"andi", 12>; + +def XeGPU_AtomicRMWKind : I32EnumAttr<"AtomicRMWKind", + "Operation type for AtomicRMW", + [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN, + ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU, + ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU, + ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI, + ATOMIC_RMW_KIND_ANDI]> { + let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } +def XeGPU_AtomicRMWKindAttr : EnumAttr; #endif // 
MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td index ae29f87a8812a..f85ccb32cc43b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td @@ -22,31 +22,23 @@ include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/ShapedOpInterfaces.td" - -// Provide a definition of the 'XeGPU' dialect in the ODS framework so that we -// can define our operations. -def XeGPUDialect : Dialect { - // The namespace of our dialect +def XeGPU_Dialect : Dialect { let name = "xegpu"; - - // A short one-line summary of our dialect. + let cppNamespace = "::mlir::xegpu"; let summary = "The XeGPU dialect that models Intel GPU's ISA"; - - // A longer description of our dialect. let description = [{ - The XeGPU dialect models Intel Xe ISA semantics but works at vector and - TensorDesc data type. It provides 1:1 mappings to match Xe instructions like - DPAS and 2D block load. The matrix size being processed at this level - exactly matches the hardware instructions or the intrinsic supported by - the lower-level GPU compiler. - }]; - - // The C++ namespace that the dialect class definition resides in. - let cppNamespace = "::mlir::xegpu"; - - let dependentDialects = ["::mlir::memref::MemRefDialect"]; + The XeGPU dialect models Intel Xe ISA semantics but works at vector and + TensorDesc data type. It provides 1:1 mappings to match Xe instructions + like DPAS and 2D block load. The matrix size being processed at this level + exactly matches the hardware instructions or the intrinsic supported by + the lower-level GPU compiler. + }]; + + let dependentDialects = [ + "arith::ArithDialect", + "memref::MemRefDialect" + ]; - // TODO: temporary disable it. let useDefaultTypePrinterParser = true; let useDefaultAttributePrinterParser = true; } diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 6866f903d715e..766590f6a3f87 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -19,35 +19,36 @@ include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td" // * The parent dialect of the operation. // * The mnemonic for the operation, or the name without the dialect prefix. // * A list of traits for the operation. -class XeGPU_Op traits = []> : - Op; +class XeGPU_Op traits = []>: + Op; -def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { let summary = "create nd tensor descriptor operation"; let description = [{ The "create_nd_tdesc" operation creates a TensorDescType which represents a sub-view of a 2D memory region (It can be extended to support N-D memory - region if needed in future). Elements in the subview continuous in each dimention. - It encodes the following important information for supporting intel hardware features: - - * source: an object representing (starting address/pointer of) a 2D memory reagion. It can - be either a 2D memref object, or simply a pointer represented by uint64_t type. - * offsets: two index values represents offsets from the "source" at the each dimension at - which the subview of the target memory will be created. 
It is encoded via two - variables, including "dynamic_offsets" and "static_offsets", such that it can - accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])). - * shape: the shape information of the memory region pointed by the "source". It is typically - encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. But if "source" - is simply a pointer represented as uint64_t type, or a memref type without shape information - e.g., memref, the shape information has to be explicitly passed via the "dynamic_shape" - argument. Currently "dynamic_shape" only accepts operands(e.g., [%c4096, %c4096]), - not attributes(e.g., [4096, 4096]). - * strides: the strides of the memory region pointed by the "source". Similar to shape, it is typically - encoded via the MemRefType of the source too. But if "source" is simply a pointer represented - as uint64_t type, or a memref type without shape information e.g., memref, the strides - information has to be explicitly passed via the "dynamic_strides" argument. And it currently - only accepts operands two. + region if needed in future). Elements in the subview are continuous in each + dimension. It encodes the following important information for supporting + Intel hardware features: + + * source: an object representing (starting address/pointer of) a 2D memory region. + It can be either a 2D memref object, or simply a pointer represented by uint64_t type. + * offsets: two index values represent offsets from the "source" at each dimension + at which the subview of the target memory will be created. It is encoded via two + variables, including "dynamic_offsets" and "static_offsets", such that it can + accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])). + * shape: the shape information of the memory region pointed by the "source". It is + typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. + But if "source" is simply a pointer represented as uint64_t type, or a memref + type without shape information e.g., memref, the shape information has + to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" + only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]). + * strides: the strides of the memory region pointed by the "source". Similar to shape, + it is typically encoded via the MemRefType of the source too. But if "source" is + simply a pointer represented as uint64_t type, or a memref type without shape + information e.g., memref, the strides information has to be explicitly + passed via the "dynamic_strides" argument. And it currently only accepts operands too. 
Example 1 (suppose the tensor shape inferred by the compiler is 8x16): %0 = memref.alloc() : memref<32x24xf32> @@ -68,30 +69,32 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32> }]; - let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $dynamic_offsets, - Variadic: $dynamic_shape, - Variadic: $dynamic_strides, - DenseI64ArrayAttr: $static_offsets, - DefaultValuedAttr: $mode); - + let arguments = (ins XeGPU_BaseAddrType: $source, + Variadic: $dynamic_offsets, + Variadic: $dynamic_shape, + Variadic: $dynamic_strides, + DenseI64ArrayAttr: $static_offsets, + DefaultValuedAttr: $mode); let results = (outs XeGPU_TensorDesc:$TensorDesc); let hasCustomAssemblyFormat = 1; - let skipDefaultBuilders = 1; + let hasVerifier = 1; let builders = [ - OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, - "ValueRange": $shape, "ValueRange": $strides, "::llvm::ArrayRef": $static_offsets, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)>, + OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, + "ValueRange": $shape, "ValueRange": $strides, + "llvm::ArrayRef": $static_offsets, + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, "::llvm::ArrayRef": $offsets, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)>, + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets, + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, "::llvm::ArrayRef": $offsets, + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets, "ValueRange": $shape, "ValueRange": $stride, - CArg<"xegpu::Mode", "xegpu::Mode::SIMT">: $mode)> + CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)> ]; let extraClassDeclaration = [{ @@ -124,7 +127,6 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe /// strides information from memref type will be ignored. llvm::SmallVector getStrides(); - /// return the shape embeded in the memref type of the source. /// If source is not memref type. array of kDynamic will be returned. llvm::ArrayRef getStaticShape(); @@ -133,92 +135,37 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe /// If source is not memref type. array of kDynamic will be returned. llvm::ArrayRef getStaticStrides(); - /// Return the element type of the TensorDesc Type getElementType(); /// Return the shape of the TensorDesc llvm::ArrayRef getTensorDescShape(); - - - }]; - let hasVerifier = 1; -} - -def XeGPU_CreateDescOp - : XeGPU_Op<"create_tdesc", [Pure]> { - - let summary = "create scattered tensor descritors (TensorDesc)."; - let description = [{ - "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates a TensorDesc for a memory region. - while "create_nd_tdesc" is for creating continious subviews, "create_tdesc" is for creating non-continious - (scattered) subviews. It only works with VectorCompute (VC) mode and accepts the following parameters: - - * source: a 1D memref or pointer (uint64_t) represents the memory object. - * offsets: It is a 1D vector containing offsets of each access point, the size should be aligned with - supportted group size, e.g., vector<16xindex>. And each element in the vector corresponds to a - work item (SIMT lane) in the subgroup. 
- * chunk_size_per_lane: [optional attribute] indicates number of continious elements accessed for each offset, default is 1. - - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - %a = memref.alloc() : memref<1024xf32> - %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> - %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32> - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. - It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - %0 = memref.alloc() : memref<1024xf32> - %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> - %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> - }]; - - let arguments = (ins XeGPU_BaseAddrType: $source, - XeGPU_OffsetType: $offsets, - DefaultValuedAttr: $chunk_size_per_lane, - DefaultValuedAttr: $mode); - - let results = (outs XeGPU_TensorDesc:$TensorDesc); - - let builders = [ - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>, - - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)> - ]; - let skipDefaultBuilders = 1; - - // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} - // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; } - def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { - let summary = "loads a n-D block from global memory (represented by TensorDesc) to registers (represented by vector)"; + let summary = "loads a n-D block from memory (represented by TensorDesc) " + "to registers (represented by vector)"; let description = [{ - LoadNDOp essentially mimics the hardware block read instruction to read a block of data from memory to register. - It takes a set of cache hints for each level of cache, L1, L2 and L3. If hardware does not have a correspoding cache, - Corresponding cache hint attribute will be masked. - - If both transpose and vnni_axis present at the same time. it assume to perform transpose first and then vnni transform. + LoadNDOp essentially mimics the hardware block read instruction to read + a block of data from memory to register. It takes a set of cache hints + for each level of cache, L1, L2 and L3. If hardware does not have a + corresponding cache, the corresponding cache hint attribute will be masked. + If both transpose and vnni_axis are present at the same time, transpose is + assumed to be performed first, followed by the vnni transform. 
}]; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $vnni_axis, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode); + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $vnni_axis, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + OptionalAttr: $transpose, + DefaultValuedAttr: $mode); let results = (outs XeGPU_ValueType: $value); let extraClassDeclaration = [{ - VectorType getValueType() { return llvm::dyn_cast(getValue().getType()); } @@ -226,26 +173,22 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { xegpu::TensorDescType getTensorDescType() { return getTensorDesc().getType(); } - }]; // Format: xegpu.load_nd %1 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming} // : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; } def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_ValueType: $value, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + let arguments = (ins XeGPU_ValueType: $value, + XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); // Format: xegpu.store_nd %3, %2 {l1_hint = write_back, l2_hint = uncached} // : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> @@ -256,13 +199,12 @@ def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { def XeGPU_PrefetchNDOp : XeGPU_Op<"prefetch_nd", []> { let summary = "prefetches a nD block to cache"; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); - // In format of: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}: + // Format: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}: // !xegpu.tensor_desc<8x16xf16> let hasCustomAssemblyFormat = 1; } @@ -273,85 +215,93 @@ def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> { let arguments = (ins XeGPU_TensorDesc: $TensorDesc, Variadic: $offsets, - DefaultValuedAttr: $mode); + DefaultValuedAttr: $mode); let results = (outs XeGPU_TensorDesc: $result); - let assemblyFormat = [{ - $TensorDesc `,` (`[` $offsets^ `]`)? (`{` `mode` `=` $mode^ `}`)? - attr-dict `:` qualified(type($TensorDesc)) `->` qualified(type($result)) - }]; - + let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } +def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure]> { + let summary = "create scattered tensor descritors (TensorDesc)."; + let description = [{ + "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates + a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc" + is for creating continious subviews, "create_tdesc" is for creating non-continious + (scattered) subviews. 
It is designed to work only with VectorCompute (VC) mode and + accepts the following parameters: -def XeGPU_DpasOp : XeGPU_Op<"dpas"> { - let summary = "performs dpas computation"; - let arguments = (ins - XeGPU_DpasOpType : $lhs, - XeGPU_DpasOpType : $rhs, - Optional: $acc, - DefaultValuedAttr: $mode - ); - let results = (outs XeGPU_Vector2DType: $result); - let assemblyFormat = [{ - $lhs `,` $rhs (`,` $acc^)? (` ``{` `mode` `=` $mode^ `}`)? attr-dict `:` - qualified(type($lhs)) `,` qualified(type($rhs)) (`,` qualified(type($acc))^)? `->` qualified(type($result)) - }]; + * source: a 1D memref or pointer (uint64_t) representing the memory object. + * offsets: It is a 1D vector containing offsets of each access point, the size should be + aligned with the supported group size, e.g., vector<16xindex>. And each element in the vector corresponds + to a work item (SIMT lane) in the subgroup. + * chunk_size_per_lane: [optional attribute] indicates number of continuous elements + accessed for each offset, default is 1. - let extraClassDeclaration = [{ - VectorType getLhsType() { - return ::llvm::cast(getLhs().getType()); - } + Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] + %a = memref.alloc() : memref<1024xf32> + %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> + %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32> - VectorType getRhsType() { - return ::llvm::cast(getRhs().getType()); - } + Example 2. It assumes subgroup size is 4, and each workitem accesses 8 elements. + It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] + %0 = memref.alloc() : memref<1024xf32> + %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> + %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + }]; - VectorType getAccType() { - return ::llvm::cast(getAcc().getType()); - } + let arguments = (ins XeGPU_BaseAddrType: $source, + XeGPU_OffsetType: $offsets, + DefaultValuedAttr: $chunk_size_per_lane, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_TensorDesc:$TensorDesc); - VectorType getResultType() { return getResult().getType(); } - }]; + let builders = [ + OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, + "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>, + OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, + "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)> + ]; + let skipDefaultBuilders = 1; + // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} + // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> + let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } def XeGPU_LoadGatherOp : XeGPU_Op<"load"> { let summary = "load a scalar at source[offset]."; - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, - OptionalAttr: $vnni_axis, - OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); - + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_MaskType: $mask, + OptionalAttr: $vnni_axis, + OptionalAttr: $transpose, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); let results = (outs XeGPU_ValueType: $value); let builders = [ - OpBuilder<(ins "Type": $value, "Value": $TensorDesc, "Value": $mask, "IntegerAttr": $vnni_axis, - CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, - CArg<"xegpu::CacheReadHintAttr", 
"xegpu::CacheReadHintAttr()">: $l1_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l2_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l3_hint)>, - - OpBuilder<(ins "Type": $value, "Value": $TensorDesc, "Value": $mask, "IntegerAttr": $vnni_axis, - CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l1_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l2_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l3_hint)> - + OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc, + "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis, + CArg<"mlir::DenseI64ArrayAttr", "mlir::DenseI64ArrayAttr()">: $transpose, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, + + OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc, + "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis, + CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)> ]; let skipDefaultBuilders = 1; - // In format of: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached} + // Format: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached} // : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -364,22 +314,21 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> { XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode ); let builders = [ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l1_hint, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l2_hint, - CArg<"xegpu::CacheWriteHintAttr", "xegpu::CacheWriteHintAttr()">: $l3_hint)>, - + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l1_hint, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l2_hint, - CArg<"xegpu::CacheWriteHint", "xegpu::CacheWriteHint::WRITE_BACK">: $l3_hint)> + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l3_hint)> ]; let skipDefaultBuilders = 1; @@ -389,71 +338,86 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> { let hasVerifier = 1; } -def XeGPU_UpdateOffsetOp - : XeGPU_Op<"update_offset", []> { - let summary = "update the offsets for the given tensor descriptor"; - - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - XeGPU_OffsetType: $offsets, - DefaultValuedAttr: $mode - ); - - 
let results = (outs XeGPU_TensorDesc: $result); - - let builders = [ - OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets), [{ - $_state.addOperands(TensorDesc); - $_state.addOperands(offsets); - $_state.getOrAddProperties().mode = xegpu::ModeAttr::get($_builder.getContext(), xegpu::Mode::VC); - $_state.addTypes(result); - }]> - ]; - - let skipDefaultBuilders = 1; - - let assemblyFormat = [{ - $TensorDesc `,` $offsets (`{` `mode` `=` $mode^ `}`)? - attr-dict `:` qualified(type($TensorDesc)) `,` qualified(type($offsets)) `->` qualified(type($result)) - }]; - - let hasVerifier = 1; - } - def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { let summary = "prefetches a nD block to cache"; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint, - DefaultValuedAttr: $mode - ); + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode); let builders = [ OpBuilder<(ins "Value": $TensorDesc, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l1_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l2_hint, - CArg<"xegpu::CacheReadHintAttr", "xegpu::CacheReadHintAttr()">: $l3_hint)>, - + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint, + CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>, OpBuilder<(ins "Value": $TensorDesc, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l1_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l2_hint, - CArg<"xegpu::CacheReadHint", "xegpu::CacheReadHint::CACHED">: $l3_hint)> + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint, + CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)> ]; let skipDefaultBuilders = 1; + let hasVerifier = 1; - // In format of: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}: + // Format: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}: // !xegpu.tensor_desc<8x16xf16> let hasCustomAssemblyFormat = 1; +} + +def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", []> { + let summary = "update the offsets for the given tensor descriptor"; + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + XeGPU_OffsetType: $offsets, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_TensorDesc: $result); + + let builders = [ + OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets)> + ]; + + let skipDefaultBuilders = 1; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + +def XeGPU_DpasOp : XeGPU_Op<"dpas"> { + let summary = "performs dpas computation"; + let arguments = (ins + XeGPU_DpasOpType : $lhs, + XeGPU_DpasOpType : $rhs, + Optional: $acc, + DefaultValuedAttr: $mode + ); + let results = (outs XeGPU_Vector2DType: $result); + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + VectorType getLhsType() { + return ::llvm::cast(getLhs().getType()); + } + + VectorType getRhsType() { + return ::llvm::cast(getRhs().getType()); + } + + VectorType getAccType() { + return ::llvm::cast(getAcc().getType()); + } + + VectorType getResultType() { + return getResult().getType(); + } + }]; + let hasVerifier = 1; } def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { let summary = "Invoke_SIMD operation"; let description = [{ - The `xegpu.invoke_SIMD` operation works similar to a direct call to a 
function. But it is - special to Intel GPU. + The `xegpu.invoke_SIMD` operation works similar to a direct call to a function. + But it is special to Intel GPU. }]; let arguments = (ins FlatSymbolRefAttr:$callee, @@ -463,15 +427,12 @@ def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { let builders = [ OpBuilder<(ins "SymbolRefAttr":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, - + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, OpBuilder<(ins "StringAttr":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, - + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>, OpBuilder<(ins "llvm::StringRef":$callee, "TypeRange":$results, - "xegpu::ArgTypeAttr":$argType, CArg<"ValueRange", "{}">:$operands)> + "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)> ]; - } def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { @@ -481,107 +442,64 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { XeGPU_TensorDesc:$tensorDesc, XeGPU_MaskType:$mask, Optional:$value, - DefaultValuedAttr: $mode + DefaultValuedAttr: $mode ); + let results = (outs XeGPU_ValueType:$result); - let assemblyFormat = [{ - $kind $tensorDesc `,` $mask (`,` $value^)? (`{` `mode` `=` $mode^ `}`)? attr-dict `:` qualified(type(operands)) `->` type($result) - }]; + let hasCustomAssemblyFormat = 1; let builders = [ OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKindAttr": $kind, - "Value": $tensorDesc, "Value": $mask, - "Value": $value)>, - + "Value": $tensorDesc, "Value": $mask, "Value": $value)>, OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKind": $kind, - "Value": $tensorDesc, "Value": $mask, - "Value": $value)> + "Value": $tensorDesc, "Value": $mask, "Value": $value)> ]; let skipDefaultBuilders = 1; - let hasVerifier = 1; } - def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> { - let summary = "allocate a specific number of named barriers."; - let arguments = (ins I32Attr: $nbarrierCount); - let assemblyFormat = "$nbarrierCount attr-dict"; + let summary = "allocate a specific number of named barriers."; + let arguments = (ins I64Attr: $nbarrierCount); + let assemblyFormat = "$nbarrierCount attr-dict"; +} + + +def XeGPU_CreateNbarrierOp: XeGPU_Op<"create_nbarrier", []> { + let summary = "create a named barrier."; + let arguments = (ins I8: $nbarrier_id, + I8: $nbarrier_role, + I8Attr: $num_producers, + I8Attr: $num_consumers, + DefaultValuedAttr: $mode); + let results = (outs XeGPU_Nbarrier: $result); + let hasCustomAssemblyFormat = 1; } +def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> { + let summary = "arrive at a named barrier."; + let arguments = (ins XeGPU_Nbarrier: $payload); + let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload))}]; +} -def XeGPU_CreateNbarrierOp - : XeGPU_Op<"create_nbarrier", []> { - let summary = "create a named barrier."; - - let arguments = (ins - I8: $nbarrier_id, - I8: $nbarrier_role, - I8Attr: $num_producers, - I8Attr: $num_consumers, - DefaultValuedAttr: $mode - ); - - let results = (outs XeGPU_Nbarrier: $result); - - let assemblyFormat = [{ - $nbarrier_id `,` $nbarrier_role - attr-dict `:` `(` qualified(type($nbarrier_id)) `,` qualified(type($nbarrier_role)) `)` - `->` qualified(type($result)) - }]; - - // let hasVerifier = 1; - } - -def XeGPU_NbarrierArriveOp - : XeGPU_Op<"nbarrier_arrive", []> { - let summary = "arrive at a named barrier."; - - let arguments = (ins - XeGPU_Nbarrier: $payload 
- ); - - let assemblyFormat = [{ - $payload attr-dict `:` qualified(type($payload)) - }]; - } - -def XeGPU_NbarrierWaitOp - : XeGPU_Op<"nbarrier_wait", []> { - let summary = "wait for a named barrier."; - - let arguments = (ins - XeGPU_Nbarrier: $payload - ); - - let assemblyFormat = [{ - $payload attr-dict `:` qualified(type($payload)) - }]; - } - -def XeGPU_CompileHintOp - : XeGPU_Op<"compile_hint", []> { - let summary = "prevents the compiler from scheduling."; - - let assemblyFormat = [{ - attr-dict - }]; - } - -def XeGPU_MfenceOp - : XeGPU_Op<"mfence", []> { - let summary = "lsc fence."; - - let arguments = (ins - StrAttr: $memory_kind, - StrAttr: $fence_op, - StrAttr: $fence_scope - ); - - let assemblyFormat = [{ - attr-dict - }]; - } +def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> { + let summary = "wait for a named barrier."; + let arguments = (ins XeGPU_Nbarrier: $payload); + let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload)) }]; +} + +def XeGPU_CompileHintOp: XeGPU_Op<"compile_hint", []> { + let summary = "prevents the compiler from scheduling."; + let assemblyFormat = [{ attr-dict }]; +} + +def XeGPU_MfenceOp: XeGPU_Op<"mfence", []> { + let summary = "lsc fence."; + let arguments = (ins StrAttr: $memory_kind, + StrAttr: $fence_op, + StrAttr: $fence_scope); + let assemblyFormat = [{ attr-dict }]; +} #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 8d2f1e769c304..b3dceff9587ad 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -15,7 +15,6 @@ include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td" include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" // An Integer array attribute with fixed 2 elements. 
-def XeGPU_IntArrayAttr2: ConfinedAttr]>; def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>; def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; @@ -31,7 +30,7 @@ def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef traits = [], string baseCppClass = "::mlir::Type"> - : TypeDef { + : TypeDef { let mnemonic = typeMnemonic; } @@ -108,13 +107,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", )>, TypeBuilder<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - "mlir::xegpu::MemoryScope": $memory_scope, "int": $array_length, + "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length, "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered, "mlir::xegpu::SubGroupMapAttr": $mapping )>, TypeBuilderWithInferredContext<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - "mlir::xegpu::MemoryScope": $memory_scope, "int": $array_length, + "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length, "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered, "mlir::xegpu::SubGroupMapAttr": $mapping )> @@ -147,7 +146,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return llvm::dyn_cast_if_present(getEncoding()); } - xegpu::MemoryScope getMemoryScope(); + xegpu::MemoryScopeKind getMemoryScope(); int getArrayLength(); bool getBoundaryCheck(); xegpu::ScatteredAttr getScattered(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 4a2eec6fde163..60ab50227c224 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -76,8 +76,7 @@ mlir::LogicalResult SubGroupMapAttr::verify( mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, mlir::Type type) { - - mlir::FailureOr memory_scope; + mlir::FailureOr memory_scope; mlir::FailureOr array_length; mlir::FailureOr boundary_check; mlir::FailureOr scattered; @@ -105,7 +104,7 @@ mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, seen_memory_scope = true; // Parse variable 'memory_scope' memory_scope = - mlir::FieldParser::parse(parser); + mlir::FieldParser::parse(parser); if (mlir::failed(memory_scope)) return parser.emitError( parser.getCurrentLocation(), @@ -157,7 +156,8 @@ mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser, if (parser.parseGreater()) return {}; return TensorDescAttr::get( - parser.getContext(), memory_scope.value_or(xegpu::MemoryScope::GLOBAL), + parser.getContext(), + memory_scope.value_or(xegpu::MemoryScopeKind::GLOBAL), array_length.value_or(1), boundary_check.value_or(true), scattered.value_or(xegpu::ScatteredAttr()), map.value_or(xegpu::SubGroupMapAttr())); @@ -169,7 +169,7 @@ void TensorDescAttr::print(::mlir::AsmPrinter &printer) const { printer << "<"; - if (printDefaults || getMemoryScope() != xegpu::MemoryScope::GLOBAL) { + if (printDefaults || getMemoryScope() != xegpu::MemoryScopeKind::GLOBAL) { if (printSep) printer << ", "; printSep = true; @@ -208,7 +208,7 @@ void TensorDescAttr::print(::mlir::AsmPrinter &printer) const { bool TensorDescAttr::hasNonDefaultAttrs() { int count = 0; - if (getMemoryScope() != MemoryScope::GLOBAL) + if (getMemoryScope() != MemoryScopeKind::GLOBAL) count++; if (getBoundaryCheck() != true) count++; @@ -222,7 +222,7 @@ bool TensorDescAttr::hasNonDefaultAttrs() { } 
TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, - xegpu::MemoryScope memory_scope, + xegpu::MemoryScopeKind memory_scope, int array_length, xegpu::ScatteredAttr scattered, xegpu::SubGroupMapAttr map) { @@ -286,12 +286,12 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { if (printDefaultValues()) { auto encoding = getEncoding(); if (auto attr = getEncodingAsMapAttr()) { - encoding = - TensorDescAttr::get(getContext(), MemoryScope::GLOBAL, 1, {}, attr); + encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1, + {}, attr); } if (auto attr = getEncodingAsScatteredAttr()) { - encoding = - TensorDescAttr::get(getContext(), MemoryScope::GLOBAL, 1, attr, {}); + encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1, + attr, {}); } printer << ", " << encoding; } else if (auto encoding = getEncodingAsTensorDescAttr()) { @@ -312,7 +312,7 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, TensorDescType TensorDescType::get(mlir::MLIRContext *context, llvm::ArrayRef shape, mlir::Type elementType, - mlir::xegpu::MemoryScope memory_scope, + mlir::xegpu::MemoryScopeKind memory_scope, int array_length, bool boundary_check, mlir::xegpu::ScatteredAttr scattered, mlir::xegpu::SubGroupMapAttr mapping) { @@ -323,7 +323,7 @@ TensorDescType TensorDescType::get(mlir::MLIRContext *context, TensorDescType TensorDescType::get(llvm::ArrayRef shape, mlir::Type elementType, - mlir::xegpu::MemoryScope memory_scope, + mlir::xegpu::MemoryScopeKind memory_scope, int array_length, bool boundary_check, mlir::xegpu::ScatteredAttr scattered, mlir::xegpu::SubGroupMapAttr mapping) { @@ -333,12 +333,12 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, return Base::get(elementType.getContext(), shape, elementType, attr); } -xegpu::MemoryScope TensorDescType::getMemoryScope() { +xegpu::MemoryScopeKind TensorDescType::getMemoryScope() { auto attr = getEncodingAsTensorDescAttr(); if (attr) return attr.getMemoryScope(); // return default value - return MemoryScope::GLOBAL; + return MemoryScopeKind::GLOBAL; } int TensorDescType::getArrayLength() { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a831e3bc2ae5d..627680e84ec94 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -23,27 +23,12 @@ #define DEBUG_TYPE "xegpu" namespace mlir { +class Token; + namespace xegpu { extern bool printDefaultValues(); -static size_t getRankOf(Value value) { - if (value.getType().isIntOrIndexOrFloat()) - return 0; - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - llvm_unreachable("Unsupported value for getRankOf"); -} - -static void transpose(llvm::ArrayRef trans, - std::vector &shape) { - std::vector old = shape; - for (size_t i = 0; i < trans.size(); i++) - shape[i] = old[trans[i]]; -}; - template static std::string makeString(T array, bool breakline = false) { std::string buf; @@ -60,121 +45,21 @@ static std::string makeString(T array, bool breakline = false) { return buf; } -template -static ParseResult parseCustomEnumAttr(OpAsmParser &parser, - OperationState &result, - llvm::StringRef attrKeyword) { - auto loc = parser.getCurrentLocation(); - auto attrOptional = FieldParser::parse(parser); - if (failed(attrOptional)) - return parser.emitError(loc, "invalid attribute specification"); - auto attr = - CustomEnumAttr::get(parser.getBuilder().getContext(), 
*attrOptional); - result.addAttribute(attrKeyword, attr); - return success(); -} - -template -static ParseResult parseBoolAndIntegerAttr(OpAsmParser &parser, - OperationState &result, - llvm::StringRef attrKeyword) { - AttrType attr; - Type ty; - - if (std::is_same::value) { - ty = parser.getBuilder().getIntegerType(1); - } else if (std::is_same::value) { - ty = parser.getBuilder().getIntegerType(32); - } else if (std::is_same::value) { - ty = Type{}; - } else { - llvm_unreachable("Unsupported Attribute Type."); - } - - if (parser.parseCustomAttributeWithFallback(attr, ty)) - return failure(); - - if (attr) - result.addAttribute(attrKeyword, attr); - return success(); -}; - -/// @brief Parsing optional attribute list which are enclosed in braces "{}", -/// and seperated by comma -/// @param parser -/// @param result -/// @param allowedKeywords -/// @return -static ParseResult -parseOptionalAttrDict(OpAsmParser &parser, OperationState &result, - llvm::ArrayRef allowedKeywords, - bool isWrite = false) { - // no optional attributes, return success - if (failed(parser.parseOptionalLBrace())) - return success(); - - auto parseElt = [&]() -> ParseResult { - auto loc = parser.getCurrentLocation(); - llvm::StringRef nameId; - if (parser.parseOptionalKeyword(&nameId, allowedKeywords)) - return parser.emitError(loc, "invalid attribute keyword: ") - << nameId << ".\n"; - - if (parser.parseEqual()) - return failure(); - - if (nameId == "l1_hint" || nameId == "l2_hint" || nameId == "l3_hint") { - if (isWrite) - return parseCustomEnumAttr( - parser, result, nameId); - else - return parseCustomEnumAttr( - parser, result, nameId); - } - - if (nameId == "mode") { - return parseCustomEnumAttr(parser, result, nameId); - } - - if (nameId == "chunk_size_per_lane" || nameId == "vnni_axis") - return parseBoolAndIntegerAttr(parser, result, nameId); - - if (nameId == "boundary_check") - return parseBoolAndIntegerAttr(parser, result, nameId); - - if (nameId == "transpose") - return parseBoolAndIntegerAttr(parser, result, nameId); - - llvm_unreachable("Unsupported attribute keyword."); - }; - - if (parser.parseCommaSeparatedList(parseElt)) - return failure(); - - return parser.parseRBrace(); +static size_t getRankOf(Value value) { + if (value.getType().isIntOrIndexOrFloat()) + return 0; + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + llvm_unreachable("Unsupported value for getRankOf"); } -template -static void printCacheHintAttrs(OpAsmPrinter &printer, T op, bool printSep) { - if (op.getL1HintAttr()) { - if (printSep) - printer << ", "; - printer << "l1_hint = " << op.getL1Hint().value(); - printSep = true; - } - - if (op.getL2HintAttr()) { - if (printSep) - printer << ", "; - printer << "l2_hint = " << op.getL2Hint().value(); - printSep = true; - } - - if (op.getL3HintAttr()) { - if (printSep) - printer << ", "; - printer << "l3_hint = " << op.getL3Hint().value(); - } +static void transpose(llvm::ArrayRef trans, + std::vector &shape) { + std::vector old = shape; + for (size_t i = 0; i < trans.size(); i++) + shape[i] = old[trans[i]]; } static bool verifyAndInferShape(std::vector &shape, @@ -202,22 +87,94 @@ static bool verifyAndInferShape(std::vector &shape, return true; } -/// @brief the base builder for CreateNdDescOp -/// @param builder, the mlir OpBuilder -/// @param state , the mlir OperationState -/// @param TensorDesc, the TensorDescType of the result -/// @param source, the base address of 
the data. It can be either 2D memref -/// object or simple integer value (pointer) -/// @param offsets, the dynamic offset given as Value -/// @param shape, the dynamic shape given as array of Values -/// @param strides, the dynamic shape given as array of Values -/// @param static_offsets, the static offset. If it is not used it should be -/// filled with ShapeType::kDynamic -/// @param mode, VC or SIMT +static ParseResult +parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser, + OperationState &result) { + // no optional attributes, return success + if (failed(parser.parseOptionalLBrace())) + return success(); + + llvm::SmallDenseSet seenKeys; + auto parseElt = [&]() -> ParseResult { + // The name of an attribute can either be a keyword, or a string. + // as compared to mlir::parseOptionalAttrList, the cases of using + // TOken::bare_identifier and Token::inttype as key maybe not handlered + std::string nameId; + auto loc = parser.getCurrentLocation(); + if (parser.parseOptionalKeywordOrString(&nameId)) + return parser.emitError(loc, "invalid attribute name: ") + << nameId << ".\n"; + + if (nameId.empty()) + return parser.emitError(loc, "expected valid attribute name"); + + if (!seenKeys.insert(nameId).second) + return parser.emitError(loc, "duplicate key '") + << nameId << "' in dictionary attribute."; + + // Lazy load a dialect in the context if there is a possible namespace. + auto splitName = StringRef(nameId).split('.'); + if (!splitName.second.empty()) + parser.getContext()->getOrLoadDialect(splitName.first); + + // Try to parse the '=' for the attribute value. + if (parser.parseEqual()) { + // If there is no '=', it is treated as a unit attribute. + result.addAttribute(nameId, parser.getBuilder().getUnitAttr()); + return success(); + } + + // for xegpu specific attributes + if (nameId == "mode") { + ModeKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "l1_hint" || nameId == "l2_hint" || + nameId == "l3_hint") { + CacheKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "transpose") { + // in form of [4, 5], acctually it is a copy of DenseI63ArrayAttr::parse() + if (succeeded(parser.parseOptionalLSquare())) { + Attribute attr; + // handle empty list case + if (succeeded(parser.parseOptionalRSquare())) { + attr = DenseI64ArrayAttr::get(parser.getContext(), {}); + } else { + attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{}); + if (failed(parser.parseRSquare())) + return failure(); + } + if (!attr) + return failure(); + result.addAttribute(nameId, attr); + return success(); + } else { + // in form of array + DenseI64ArrayAttr attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + } else { + Attribute attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + }; + + if (parser.parseCommaSeparatedList(parseElt)) + return failure(); + + return parser.parseRBrace(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNdDescOp +//===----------------------------------------------------------------------===// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type TensorDesc, Value source, ValueRange offsets, ValueRange shape, ValueRange strides, - llvm::ArrayRef static_offsets, Mode mode) { + llvm::ArrayRef static_offsets, + ModeKind mode) { auto offsetRank = static_offsets.size(); auto shapeRank = shape.size() 
? shape.size() : getRankOf(source); @@ -243,13 +200,14 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, state.addAttribute(getStaticOffsetsAttrName(state.name), builder.getDenseI64ArrayAttr(static_offsets)); state.addAttribute(getModeAttrName(state.name), - xegpu::ModeAttr::get(builder.getContext(), mode)); + xegpu::ModeKindAttr::get(builder.getContext(), mode)); state.addTypes(TensorDesc); } void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, Value source, - llvm::ArrayRef offsets, Mode mode) { + llvm::ArrayRef offsets, + ModeKind mode) { auto ty = llvm::dyn_cast_if_present(source.getType()); assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); @@ -266,8 +224,7 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, Value source, llvm::ArrayRef offsets, - ValueRange shape, ValueRange stride, - xegpu::Mode mode) { + ValueRange shape, ValueRange stride, ModeKind mode) { assert(shape.size() && offsets.size() && stride.size() && shape.size() == stride.size() && shape.size() == offsets.size()); @@ -283,11 +240,9 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { // parse the source operand - OpAsmParser::UnresolvedOperand sourceRawOperands[1]; - llvm::ArrayRef sourceOperands( - sourceRawOperands); + llvm::SmallVector sourceOperands(1); llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(sourceRawOperands[0])) + if (parser.parseOperand(sourceOperands[0])) return failure(); // parse the offset operand, in format of [x, y] @@ -328,23 +283,28 @@ ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); } - if (parseOptionalAttrDict(parser, result, {"boundary_check", "mode"})) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - Type sourceRawTypes[1]; - llvm::ArrayRef sourceTypes(sourceRawTypes); - if (parser.parseType(sourceRawTypes[0])) + llvm::SmallVector sourceTypes(1); + if (parser.parseType(sourceTypes[0])) return failure(); if (parser.parseArrow()) return failure(); - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) + llvm::SmallVector TensorDescTypes(1); + if (parser.parseType(TensorDescTypes[0])) return failure(); result.addAttribute("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( @@ -352,11 +312,12 @@ ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { static_cast(shapeOperands.size()), static_cast(stridesOperands.size())})); - Type indexType = parser.getBuilder().getIndexType(); result.addTypes(TensorDescTypes); if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, result.operands)) return failure(); + + Type indexType = parser.getBuilder().getIndexType(); if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, result.operands)) return failure(); @@ -391,11 +352,13 @@ void CreateNdDescOp::print(OpAsmPrinter &printer) { printer << "]"; } - if (printDefaults || mode != Mode::SIMT) { - 
printer << ' ' << "{"; - printer << "mode = " << mode; - printer << "}"; - } + llvm::SmallVector elidedAttrs; + elidedAttrs.push_back("static_offsets"); + elidedAttrs.push_back("operandSegmentSizes"); + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -415,12 +378,12 @@ LogicalResult CreateNdDescOp::verify() { "non-scattered operators.\n"); } - if (mode == Mode::VC && mapping) { + if (mode == ModeKind::VC && mapping) { return emitOpError("Mapping attribute of TensorDesc is not expected " "for VC mode operations.\n"); } - if (mode == Mode::SIMT && !mapping) { + if (mode == ModeKind::SIMT && !mapping) { return emitOpError("Expecting SgMap attribute for SIMT mode operators.\n"); } @@ -494,8 +457,8 @@ llvm::SmallVector CreateNdDescOp::getShape() { return shape; } - emitOpError("The shape information is missing."); - llvm_unreachable("Unexpected error in CreateNdDescOp.\n"); + llvm_unreachable("Unexpected error in CreateNdDescOp. " + "The shape information is missing.\n"); } llvm::ArrayRef CreateNdDescOp::getStaticStrides() { @@ -541,199 +504,42 @@ llvm::ArrayRef CreateNdDescOp::getTensorDescShape() { return getTensorDescType().getShape(); } -ParseResult CreateDescOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand sourceRawOperands[1]; - llvm::ArrayRef sourceOperands( - sourceRawOperands); - llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(sourceRawOperands[0])) - return failure(); - - if (parser.parseComma()) - return failure(); - - OpAsmParser::UnresolvedOperand offsetsRawOperands[1]; - llvm::ArrayRef offsetsOperands( - offsetsRawOperands); - llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(offsetsRawOperands[0])) - return failure(); - - if (parseOptionalAttrDict(parser, result, {"chunk_size_per_lane", "mode"})) - return failure(); - - if (parser.parseColon()) - return failure(); - - Type sourceRawTypes[1]; - llvm::ArrayRef sourceTypes(sourceRawTypes); - if (parser.parseType(sourceRawTypes[0])) - return failure(); - if (parser.parseComma()) - return failure(); - - Type offsetsRawTypes[1]; - llvm::ArrayRef offsetsTypes(offsetsRawTypes); - if (parser.parseType(offsetsRawTypes[0])) - return failure(); - if (parser.parseArrow()) - return failure(); - - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) - return failure(); +//===----------------------------------------------------------------------===// +// XeGPU_LoadNDOp +//===----------------------------------------------------------------------===// - result.addTypes(TensorDescTypes); - if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, - result.operands)) - return failure(); - if (parser.resolveOperands(offsetsOperands, offsetsTypes, offsetsOperandsLoc, - result.operands)) +ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(1); + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(Operands[0])) return failure(); - return success(); -} - -void CreateDescOp::print(OpAsmPrinter &printer) { - auto mode = getMode(); - bool printSep = false; - auto chunk = getChunkSizePerLane(); - auto printDefaults = printDefaultValues(); - - printer << ' '; - printer << getSource(); - printer << ","; - printer << ' '; - 
printer << getOffsets(); - - if (printDefaults || mode != Mode::SIMT || chunk != 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << mode; - printSep = true; - } - - if (printDefaults || chunk != 1) { - if (printSep) - printer << "," << ' '; - printer << "chunk_size_per_lane = " << chunk; - } - - if (printDefaults || mode != Mode::SIMT || chunk != 1) { - printer << "}"; - } - - printer << ' ' << ":"; - printer << ' '; - printer << getSource().getType(); - printer << ","; - printer << ' '; - printer << getOffsets().getType(); - printer << ' ' << "->"; - printer << ' '; - printer << getTensorDesc().getType(); -} - -LogicalResult CreateDescOp::verify() { - auto mode = getMode(); - auto mapping = getTensorDesc().getType().getMapping(); - auto offsetTy = getOffsets().getType(); - auto tdescTy = getTensorDesc().getType(); - auto chunkSize = getChunkSizePerLane(); - - if (mode == Mode::SIMT || mapping) { - return emitOpError("CreateDescOp only support VC mode and mapping " - "attribute of TensorDesc is not expected.\n"); - } - - if (getRankOf(getSource()) > 2) - return emitOpError( - "Expecting the source is a 1D/2D memref or pointer (uint64_t)."); - - if (!tdescTy.getScattered()) - return emitOpError( - "Expecting the presence of ScatteredAttr for tensor descriptor."); - // Infer the TensorDesc shape - std::vector shape; - if (llvm::isa(offsetTy)) { - shape = llvm::dyn_cast(offsetTy).getShape().vec(); - if (shape.size() != 1) - return emitOpError("Expecting the offset is a 1D vector."); - } - - if (chunkSize != 1) { - shape.push_back(chunkSize); - } - - auto tdescShape = tdescTy.getShape(); - if (shape != tdescShape.vec()) { - return emitOpError("Expecting dimensions of offsets is the same as the " - "tensor descriptor, or one less than."); - } - - return success(); -} - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, Value offsets, - uint32_t chunk_size_per_lane) { - state.addOperands(source); - state.addOperands(offsets); - state.getOrAddProperties().chunk_size_per_lane = - builder.getIntegerAttr(builder.getIntegerType(32), chunk_size_per_lane); - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); - state.addTypes(TensorDesc); -} - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, Value offsets, - IntegerAttr chunk_size_per_lane) { - state.addOperands(source); - state.addOperands(offsets); - if (chunk_size_per_lane) - state.getOrAddProperties().chunk_size_per_lane = - chunk_size_per_lane; - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); - state.addTypes(TensorDesc); -} - -ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parseOptionalAttrDict( - parser, result, - {"mode", "vnni_axis", "transpose", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - Type 
TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) + llvm::SmallVector Types(1); + if (parser.parseType(Types[0])) return failure(); if (parser.parseArrow()) return failure(); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - if (parser.parseType(valueRawTypes[0])) + llvm::SmallVector valueTypes(1); + if (parser.parseType(valueTypes[0])) return failure(); result.addTypes(valueTypes); - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); return success(); @@ -741,42 +547,16 @@ ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) { void LoadNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << mode; - printSep = true; - } - - if (getVnniAxisAttr()) { - if (printSep) - printer << "," << ' '; - printer << "vnni_axis = " << getVnniAxis().value(); - printSep = true; - } - - if (getTransposeAttr()) { - if (printSep) - printer << "," << ' '; - printer << "transpose = "; - getTransposeAttr().print(printer); - printSep = true; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -809,7 +589,7 @@ LogicalResult LoadNDOp::verify() { auto valueShape = valueTy.getShape().vec(); auto array_len = tdescTy.getArrayLength(); - if (mode == Mode::SIMT) { + if (mode == ModeKind::SIMT) { auto sgMap = tdescTy.getMapping(); if (!sgMap) { return emitOpError( @@ -864,50 +644,42 @@ LogicalResult LoadNDOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_StoreNDOp +//===----------------------------------------------------------------------===// ParseResult StoreNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand valueRawOperands[1]; - llvm::ArrayRef valueOperands( - valueRawOperands); - llvm::SMLoc valueOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(valueRawOperands[0])) + llvm::SmallVector Operands(2); + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + // parse value + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + // parse TensorDesc + if (parser.parseOperand(Operands[1])) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"}, true)) + // parse optional attributes + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parser.parseColon()) + if 
(failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - if (parser.parseType(valueRawTypes[0])) + if (parser.parseColon()) return failure(); - if (parser.parseComma()) + llvm::SmallVector Types; + if (parser.parseTypeList(Types)) return failure(); - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - if (parser.parseType(TensorDescRawTypes[0])) - return failure(); - - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) - return failure(); - - if (parser.resolveOperands(valueOperands, valueTypes, valueOperandsLoc, - result.operands)) + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); return success(); @@ -915,9 +687,7 @@ ParseResult StoreNDOp::parse(OpAsmParser &parser, OperationState &result) { void StoreNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - [[maybe_unused]] bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getValue(); @@ -925,20 +695,10 @@ void StoreNDOp::print(OpAsmPrinter &printer) { printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, true); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -969,7 +729,7 @@ LogicalResult StoreNDOp::verify() { auto mode = getMode(); - if (mode == Mode::VC) { // for VC mode, no attr attached + if (mode == ModeKind::VC) { // for VC mode, no attr attached if (dstTy.getShape() != valTy.getShape()) return emitOpError("In VC mode, the value (vector) shape doesn't match " "the memory (dst) shape.\n"); @@ -1004,26 +764,32 @@ LogicalResult StoreNDOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchNDOp +//===----------------------------------------------------------------------===// ParseResult PrefetchNDOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector TensorDescTypes(1); llvm::SMLoc TensorDescOperandsLoc; - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(TensorDescOperands[0])) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return 
failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(TensorDescTypes[0])) return failure(); if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, TensorDescOperandsLoc, result.operands)) @@ -1033,255 +799,270 @@ ParseResult PrefetchNDOp::parse(OpAsmParser &parser, OperationState &result) { void PrefetchNDOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - [[maybe_unused]] bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, true); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; printer << getTensorDesc().getType(); } -LogicalResult DpasOp::verify() { +//===----------------------------------------------------------------------===// +// XeGPU_UpdateNDOffsetOp +//===----------------------------------------------------------------------===// +ParseResult UpdateNDOffsetOp::parse(OpAsmParser &parser, + OperationState &result) { + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector offsetsOperands; + llvm::SmallVector TensorDescTypes(1); + llvm::SmallVector resultTypes(1); + llvm::SMLoc TensorDescOperandsLoc; + llvm::SMLoc offsetsOperandsLoc; - int64_t lhsRank = getLhsType().getRank(); - int64_t rhsRank = getRhsType().getRank(); - Type lhsElemType = getLhsType().getElementType(); - Type rhsElemType = getRhsType().getElementType(); + TensorDescOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(TensorDescOperands[0])) + return failure(); + if (parser.parseComma()) + return failure(); - if (lhsElemType != rhsElemType) { - return emitOpError("lhs and rhs element type does not match for dpas op"); + // parse offsets, e.g., [x, y] + if (succeeded(parser.parseOptionalLSquare())) { + offsetsOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(offsetsOperands)) + return failure(); + if (parser.parseRSquare()) + return failure(); } - if (getAcc() && getAccType() != getResultType()) { - return emitOpError("Accumulator and Result for dpas op should have the " - "same type (both shape and element type)."); - } + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); - if (lhsRank != rhsRank || lhsRank != 3) { - return emitOpError( - "lhs and rhs rank does not match for dpas op, or their rank is not 3."); + auto loc = parser.getCurrentLocation(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseType(TensorDescTypes[0])) + return failure(); + if (parser.parseArrow()) + return failure(); + + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, + TensorDescOperandsLoc, result.operands)) + return failure(); + + Type indexType = parser.getBuilder().getIndexType(); + if 
(parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, + result.operands)) + return failure(); + return success(); +} + +void UpdateNDOffsetOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + if (!getOffsets().empty()) { + printer << ' ' << "["; + printer << getOffsets(); + printer << "]"; } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult UpdateNDOffsetOp::verify() { + // number of offsets specified must match the rank of the tensor descriptor + if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) { + return emitOpError("Invalid number of offsets."); + } return success(); } -ParseResult LoadGatherOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc; - OpAsmParser::UnresolvedOperand maskRawOperands[1]; - llvm::ArrayRef maskOperands(maskRawOperands); - llvm::SMLoc maskOperandsLoc; +//===----------------------------------------------------------------------===// +// XeGPU_CreateDescOp +//===----------------------------------------------------------------------===// +void CreateDescOp::build(OpBuilder &builder, OperationState &state, + TensorDescType TensorDesc, Value source, Value offsets, + uint32_t chunk_size_per_lane) { + state.addOperands(source); + state.addOperands(offsets); + state.getOrAddProperties().chunk_size_per_lane = + builder.getIntegerAttr(builder.getIntegerType(32), chunk_size_per_lane); + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); + state.addTypes(TensorDesc); +} - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - Type maskRawTypes[1]; - llvm::ArrayRef maskTypes(maskRawTypes); - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); +void CreateDescOp::build(OpBuilder &builder, OperationState &state, + TensorDescType TensorDesc, Value source, Value offsets, + IntegerAttr chunk_size_per_lane) { + state.addOperands(source); + state.addOperands(offsets); + if (chunk_size_per_lane) + state.getOrAddProperties().chunk_size_per_lane = + chunk_size_per_lane; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); + state.addTypes(TensorDesc); +} - TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) +ParseResult CreateDescOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(2); + llvm::SmallVector Types(2); + llvm::SMLoc operandsLoc = parser.getCurrentLocation(); + // parse the source operand + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - maskOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(maskRawOperands[0])) + // parse the offset operand + if (parser.parseOperand(Operands[1])) + return failure(); + + // parse the optional attributes + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) return failure(); - if 
(parseOptionalAttrDict( - parser, result, - {"mode", "vnni_axis", "transpose", "l1_hint", "l2_hint", "l3_hint"})) + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(Types[0])) return failure(); - if (parser.parseComma()) return failure(); - if (parser.parseType(maskRawTypes[0])) + if (parser.parseType(Types[1])) return failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(valueRawTypes[0])) - return failure(); - - result.addTypes(valueTypes); - - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + llvm::SmallVector TensorDescTypes(1); + if (parser.parseType(TensorDescTypes[0])) return failure(); - if (parser.resolveOperands(maskOperands, maskTypes, maskOperandsLoc, - result.operands)) + result.addTypes(TensorDescTypes); + if (parser.resolveOperands(Operands, Types, operandsLoc, result.operands)) return failure(); return success(); } -void LoadGatherOp::print(OpAsmPrinter &printer) { +void CreateDescOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; + auto chunk = getChunkSizePerLane(); auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; - printer << getTensorDesc(); + printer << getSource(); printer << ","; printer << ' '; - printer << getMask(); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - if (getVnniAxisAttr()) { - if (printSep) - printer << "," << ' '; - printer << "vnni_axis = " << getVnniAxis().value(); - printSep = true; - } - - if (getTransposeAttr()) { - if (printSep) - printer << "," << ' '; - printer << "transpose = "; - getTransposeAttr().print(printer); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); + printer << getOffsets(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; + llvm::SmallVector elidedAttrs; + if (!printDefaults) { + if (mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + if (chunk == 1) + elidedAttrs.push_back("chunk_size_per_lane"); } + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; - printer << getTensorDesc().getType(); + printer << getSource().getType(); printer << ","; printer << ' '; - printer << getMask().getType(); + printer << getOffsets().getType(); printer << ' ' << "->"; printer << ' '; - printer << getValue().getType(); + printer << getTensorDesc().getType(); } -LogicalResult LoadGatherOp::verify() { +LogicalResult CreateDescOp::verify() { + auto mode = getMode(); + auto mapping = getTensorDesc().getType().getMapping(); + auto offsetTy = getOffsets().getType(); auto tdescTy = getTensorDesc().getType(); - auto maskTy = getMask().getType(); - auto valueTy = getValue().getType(); - - if (!tdescTy.getScattered()) - return emitOpError( - "LoadGatherOp only works on TensorDesc with ScatteredAttr."); + auto chunkSize = getChunkSizePerLane(); - auto getElementType = [&](Type type) -> Type { - if (type.isIntOrIndexOrFloat()) - return type; - else if (llvm::isa(type)) - return llvm::dyn_cast(type).getElementType(); - else if (llvm::isa(type)) - return 
llvm::dyn_cast(type).getElementType(); - llvm_unreachable("Unsupported type."); - return type; - }; + if (mode == ModeKind::SIMT || mapping) { + return emitOpError("CreateDescOp only support VC mode and mapping " + "attribute of TensorDesc is not expected.\n"); + } - auto tdescElemTy = getElementType(tdescTy); - auto valueElemTy = getElementType(valueTy); - if (tdescElemTy != valueElemTy) + if (getRankOf(getSource()) > 2) return emitOpError( - "Value should have the same element type as TensorDesc."); - - auto getShape = [&](Type type) -> std::vector { - std::vector shape; - if (type.isIntOrIndexOrFloat()) - shape.push_back(1); - else if (llvm::isa(type)) - shape = llvm::dyn_cast(type).getShape().vec(); - else - llvm_unreachable("Unsupported type."); - return shape; - }; - - std::vector maskShape = getShape(maskTy); - std::vector valueShape = getShape(valueTy); - std::vector tdescShape = tdescTy.getShape().vec(); + "Expecting the source is a 1D/2D memref or pointer (uint64_t)."); - if (tdescShape != maskShape) - return emitOpError("Mask should have the same shape as TensorDesc."); + if (!tdescTy.getScattered()) + return emitOpError( + "Expecting the presence of ScatteredAttr for tensor descriptor."); - auto mode = getMode(); - auto mapping = tdescTy.getMapping(); - if (mode == Mode::SIMT || mapping) { - return emitOpError("LoadGatherOp only supports VC mode and mapping " - "attribute of TensorDesc is not expected.\n"); + // Infer the TensorDesc shape + std::vector shape; + if (llvm::isa(offsetTy)) { + shape = llvm::dyn_cast(offsetTy).getShape().vec(); + if (shape.size() != 1) + return emitOpError("Expecting the offset is a 1D vector."); } - if (getTranspose()) { - auto trans = getTranspose().value(); - if (tdescShape.size() >= trans.size()) - transpose(trans, tdescShape); - else - emitWarning("Invalid transpose attr. It is ignored."); + if (chunkSize != 1) { + shape.push_back(chunkSize); } - if (getVnniAxis()) { - auto axis = getVnniAxis().value(); - auto vnni_factor = valueShape.back(); - tdescShape[axis] /= vnni_factor; - tdescShape.push_back(vnni_factor); + auto tdescShape = tdescTy.getShape(); + if (shape != tdescShape.vec()) { + return emitOpError("Expecting dimensions of offsets is the same as the " + "tensor descriptor, or one less than."); } - if (valueShape != tdescShape) - return emitOpError( - "Result shape doesn't match TensorDesc shape. when VNNI is not enabled," - "the result should have the same shape (or transposed shape if " - "transpose is also enabled) as TensorDesc. When VNNI is enabled, " - "the result should have one more dimention than the TensorDesc, " - "with last dimention having vnni factor, but having same number of" - "total data elements. The vnni factor are typically calculated as " - "simd_lane_width/elementTypeBitWidth. 
For element type having " - "more than 32 bits, vnni shouldn't be used.\n"); - return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_LoadGatherOp +//===----------------------------------------------------------------------===// void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, Value TensorDesc, Value mask, IntegerAttr vnni_axis, - DenseI64ArrayAttr transpose, CacheReadHintAttr l1_hint, - CacheReadHintAttr l2_hint, CacheReadHintAttr l3_hint) { + DenseI64ArrayAttr transpose, CacheKindAttr l1_hint, + CacheKindAttr l2_hint, CacheKindAttr l3_hint) { state.addOperands(TensorDesc); state.addOperands(mask); if (vnni_axis) @@ -1300,14 +1081,14 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, state.getOrAddProperties().l3_hint = l3_hint; state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(value); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, Value TensorDesc, Value mask, IntegerAttr vnni_axis, - DenseI64ArrayAttr transpose, CacheReadHint l1_hint, - CacheReadHint l2_hint, CacheReadHint l3_hint) { + DenseI64ArrayAttr transpose, CacheKind l1_hint, + CacheKind l2_hint, CacheKind l3_hint) { state.addOperands(TensorDesc); state.addOperands(mask); if (vnni_axis) @@ -1317,99 +1098,244 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value, state.getOrAddProperties().transpose = transpose; state.getOrAddProperties().l1_hint = - CacheReadHintAttr::get(builder.getContext(), l1_hint); + CacheKindAttr::get(builder.getContext(), l1_hint); state.getOrAddProperties().l2_hint = - CacheReadHintAttr::get(builder.getContext(), l2_hint); + CacheKindAttr::get(builder.getContext(), l2_hint); state.getOrAddProperties().l3_hint = - CacheReadHintAttr::get(builder.getContext(), l3_hint); + CacheKindAttr::get(builder.getContext(), l3_hint); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(value); } -ParseResult StoreScatterOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); - llvm::SMLoc TensorDescOperandsLoc; - - OpAsmParser::UnresolvedOperand valueRawOperands[1]; - llvm::ArrayRef valueOperands( - valueRawOperands); - llvm::SMLoc valueOperandsLoc; - - OpAsmParser::UnresolvedOperand maskRawOperands[1]; - llvm::ArrayRef maskOperands(maskRawOperands); - llvm::SMLoc maskOperandsLoc; - - Type valueRawTypes[1]; - llvm::ArrayRef valueTypes(valueRawTypes); - - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); - - Type maskRawTypes[1]; - llvm::ArrayRef maskTypes(maskRawTypes); +ParseResult LoadGatherOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands(2); + llvm::SmallVector Types(2); + llvm::SmallVector valueTypes(1); + llvm::SMLoc OperandsLoc; - valueOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(valueRawOperands[0])) + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(Operands[0])) return failure(); if (parser.parseComma()) return failure(); - TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(Operands[1])) return failure(); - if 
(parser.parseComma()) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - maskOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(maskRawOperands[0])) + if (parser.parseColon()) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"}, true)) + if (parser.parseType(Types[0])) return failure(); - if (parser.parseColon()) + if (parser.parseComma()) return failure(); - if (parser.parseType(valueRawTypes[0])) + if (parser.parseType(Types[1])) return failure(); - if (parser.parseComma()) + if (parser.parseArrow()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(valueTypes[0])) return failure(); - if (parser.parseComma()) + result.addTypes(valueTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); - if (parser.parseType(maskRawTypes[0])) + return success(); +} + +void LoadGatherOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getMask(); + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ","; + printer << ' '; + printer << getMask().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getValue().getType(); +} + +LogicalResult LoadGatherOp::verify() { + auto tdescTy = getTensorDesc().getType(); + auto maskTy = getMask().getType(); + auto valueTy = getValue().getType(); + + if (!tdescTy.getScattered()) + return emitOpError( + "LoadGatherOp only works on TensorDesc with ScatteredAttr."); + + auto getElementType = [&](Type type) -> Type { + if (type.isIntOrIndexOrFloat()) + return type; + else if (llvm::isa(type)) + return llvm::dyn_cast(type).getElementType(); + else if (llvm::isa(type)) + return llvm::dyn_cast(type).getElementType(); + llvm_unreachable("Unsupported type."); + return type; + }; + + auto tdescElemTy = getElementType(tdescTy); + auto valueElemTy = getElementType(valueTy); + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto getShape = [&](Type type) -> std::vector { + std::vector shape; + if (type.isIntOrIndexOrFloat()) + shape.push_back(1); + else if (llvm::isa(type)) + shape = llvm::dyn_cast(type).getShape().vec(); + else + llvm_unreachable("Unsupported type."); + return shape; + }; + + std::vector maskShape = getShape(maskTy); + std::vector valueShape = getShape(valueTy); + std::vector tdescShape = tdescTy.getShape().vec(); + + if (tdescShape != maskShape) + return emitOpError("Mask should have the same shape as TensorDesc."); + + auto mode = getMode(); + auto mapping = tdescTy.getMapping(); + if (mode == ModeKind::SIMT || mapping) { + return emitOpError("LoadGatherOp only supports VC mode and mapping " + "attribute of TensorDesc is not expected.\n"); + } + + if (getTransposeAttr()) { + auto trans = getTranspose().value(); + if (tdescShape.size() < trans.size()) + return emitWarning("Invalid transpose attr. 
It is ignored."); + transpose(trans, tdescShape); + } + + if (getVnniAxis()) { + auto axis = getVnniAxis().value(); + auto vnni_factor = valueShape.back(); + tdescShape[axis] /= vnni_factor; + tdescShape.push_back(vnni_factor); + } + + if (valueShape != tdescShape) + return emitOpError( + "Result shape doesn't match TensorDesc shape. when VNNI is not enabled," + "the result should have the same shape (or transposed shape if " + "transpose is also enabled) as TensorDesc. When VNNI is enabled, " + "the result should have one more dimention than the TensorDesc, " + "with last dimention having vnni factor, but having same number of" + "total data elements. The vnni factor are typically calculated as " + "simd_lane_width/elementTypeBitWidth. For element type having " + "more than 32 bits, vnni shouldn't be used.\n"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreScatterOp +//===----------------------------------------------------------------------===// +void StoreScatterOp::build(OpBuilder &builder, OperationState &state, + Value value, Value TensorDesc, Value mask, + CacheKindAttr l1_hint, CacheKindAttr l2_hint, + CacheKindAttr l3_hint) { + state.addOperands(value); + state.addOperands(TensorDesc); + state.addOperands(mask); + if (l1_hint) + state.getOrAddProperties().l1_hint = l1_hint; + if (l2_hint) + state.getOrAddProperties().l2_hint = l2_hint; + if (l3_hint) + state.getOrAddProperties().l3_hint = l3_hint; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); +} + +void StoreScatterOp::build(OpBuilder &builder, OperationState &state, + Value value, Value TensorDesc, Value mask, + CacheKind l1_hint, CacheKind l2_hint, + CacheKind l3_hint) { + state.addOperands(value); + state.addOperands(TensorDesc); + state.addOperands(mask); + state.getOrAddProperties().l1_hint = + CacheKindAttr::get(builder.getContext(), l1_hint); + state.getOrAddProperties().l2_hint = + CacheKindAttr::get(builder.getContext(), l2_hint); + ; + state.getOrAddProperties().l3_hint = + CacheKindAttr::get(builder.getContext(), l3_hint); + ; + state.getOrAddProperties().mode = + ModeKindAttr::get(builder.getContext(), ModeKind::VC); +} + +ParseResult StoreScatterOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) return failure(); - if (parser.resolveOperands(valueOperands, valueTypes, valueOperandsLoc, - result.operands)) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); - if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, - TensorDescOperandsLoc, result.operands)) + if (parser.parseColon()) return failure(); - if (parser.resolveOperands(maskOperands, maskTypes, maskOperandsLoc, - result.operands)) + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) return failure(); + return success(); } void StoreScatterOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer 
<< getValue(); @@ -1420,20 +1346,10 @@ void StoreScatterOp::print(OpAsmPrinter &printer) { printer << ' '; printer << getMask(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -1453,7 +1369,7 @@ LogicalResult StoreScatterOp::verify() { auto mode = getMode(); auto mapping = tdescTy.getMapping(); - if (mode != Mode::VC || mapping) + if (mode != ModeKind::VC || mapping) return emitOpError("StoreScatterOp only supports VC mode and mapping " "attribute of TensorDesc is not expected.\n"); @@ -1490,67 +1406,64 @@ LogicalResult StoreScatterOp::verify() { return success(); } -void StoreScatterOp::build(OpBuilder &builder, OperationState &state, - Value value, Value TensorDesc, Value mask, - CacheWriteHintAttr l1_hint, - CacheWriteHintAttr l2_hint, - CacheWriteHintAttr l3_hint) { - state.addOperands(value); +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchOp +//===----------------------------------------------------------------------===// +void PrefetchOp::build(OpBuilder &builder, OperationState &state, + Value TensorDesc, CacheKindAttr l1_hint, + CacheKindAttr l2_hint, CacheKindAttr l3_hint) { state.addOperands(TensorDesc); - state.addOperands(mask); - if (l1_hint) { + if (l1_hint) state.getOrAddProperties().l1_hint = l1_hint; - } - if (l2_hint) { + + if (l2_hint) state.getOrAddProperties().l2_hint = l2_hint; - } - if (l3_hint) { + + if (l3_hint) state.getOrAddProperties().l3_hint = l3_hint; - } + state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); } -void StoreScatterOp::build(OpBuilder &builder, OperationState &state, - Value value, Value TensorDesc, Value mask, - CacheWriteHint l1_hint, CacheWriteHint l2_hint, - CacheWriteHint l3_hint) { - state.addOperands(value); +void PrefetchOp::build(OpBuilder &builder, OperationState &state, + Value TensorDesc, CacheKind l1_hint, CacheKind l2_hint, + CacheKind l3_hint) { state.addOperands(TensorDesc); - state.addOperands(mask); state.getOrAddProperties().l1_hint = - CacheWriteHintAttr::get(builder.getContext(), l1_hint); + CacheKindAttr::get(builder.getContext(), l1_hint); state.getOrAddProperties().l2_hint = - CacheWriteHintAttr::get(builder.getContext(), l2_hint); - ; + CacheKindAttr::get(builder.getContext(), l2_hint); state.getOrAddProperties().l3_hint = - CacheWriteHintAttr::get(builder.getContext(), l3_hint); - ; + CacheKindAttr::get(builder.getContext(), l3_hint); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); } ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; - llvm::ArrayRef TensorDescOperands( - TensorDescRawOperands); + llvm::SmallVector TensorDescOperands(1); + llvm::SmallVector TensorDescTypes(1); llvm::SMLoc TensorDescOperandsLoc; - Type TensorDescRawTypes[1]; - llvm::ArrayRef TensorDescTypes(TensorDescRawTypes); 
TensorDescOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(TensorDescRawOperands[0])) + if (parser.parseOperand(TensorDescOperands[0])) return failure(); - if (parseOptionalAttrDict(parser, result, - {"mode", "l1_hint", "l2_hint", "l3_hint"})) + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) return failure(); if (parser.parseColon()) return failure(); - if (parser.parseType(TensorDescRawTypes[0])) + if (parser.parseType(TensorDescTypes[0])) return failure(); + if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, TensorDescOperandsLoc, result.operands)) return failure(); @@ -1559,27 +1472,15 @@ ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) { void PrefetchOp::print(OpAsmPrinter &printer) { auto mode = getMode(); - bool printSep = false; auto printDefaults = printDefaultValues(); - auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << ' ' << "{"; - } - - if (printDefaults || mode != Mode::SIMT) { - printer << "mode = " << getMode(); - printSep = true; - } - - printCacheHintAttrs(printer, *this, printSep); - - if (printDefaults || mode != Mode::SIMT || numAttrs > 1) { - printer << "}"; - } + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); printer << ' ' << ":"; printer << ' '; @@ -1591,89 +1492,235 @@ LogicalResult PrefetchOp::verify() { auto tdescTy = getTensorDesc().getType(); auto mapping = tdescTy.getMapping(); - if (tdescTy.getScattered()) + auto isValidHint = [&](CacheKindAttr attr) -> bool { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CacheKind::CACHED || kind == CacheKind::UNCACHED || + kind == CacheKind::STREAMING || kind == CacheKind::READ_INVALIDATE; + }; + + if (!isValidHint(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isValidHint(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isValidHint(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + if (!tdescTy.getScattered()) return emitOpError("Invalid TensorDesc. PrefetchOp only works on " "TensorDescs with ScatteredAttr."); - if (mode != Mode::VC || mapping) { - return emitOpError("PrefetchOp only supports VC mode. 
and mapping " + if (mode != ModeKind::VC || mapping) { + return emitOpError("PrefetchOp only supports VC mode, and mapping " "attribute of TensorDesc is not expected.\n"); } return success(); } -void PrefetchOp::build(OpBuilder &builder, OperationState &state, - Value TensorDesc, CacheReadHintAttr l1_hint, - CacheReadHintAttr l2_hint, CacheReadHintAttr l3_hint) { +//===----------------------------------------------------------------------===// +// XeGPU_UpdateOffsetOp +//===----------------------------------------------------------------------===// +void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state, + Type result, Value TensorDesc, Value offsets) { state.addOperands(TensorDesc); - if (l1_hint) - state.getOrAddProperties().l1_hint = l1_hint; + state.addOperands(offsets); + state.getOrAddProperties().mode = + xegpu::ModeKindAttr::get(builder.getContext(), xegpu::ModeKind::VC); + state.addTypes(result); +} - if (l2_hint) - state.getOrAddProperties().l2_hint = l2_hint; +ParseResult UpdateOffsetOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; - if (l3_hint) - state.getOrAddProperties().l3_hint = l3_hint; + auto OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); } -void PrefetchOp::build(OpBuilder &builder, OperationState &state, - Value TensorDesc, CacheReadHint l1_hint, - CacheReadHint l2_hint, CacheReadHint l3_hint) { - state.addOperands(TensorDesc); - state.getOrAddProperties().l1_hint = - CacheReadHintAttr::get(builder.getContext(), l1_hint); - state.getOrAddProperties().l2_hint = - CacheReadHintAttr::get(builder.getContext(), l2_hint); - state.getOrAddProperties().l3_hint = - CacheReadHintAttr::get(builder.getContext(), l3_hint); - ; - state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); +void UpdateOffsetOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getOffsets(); + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); + printer << ","; + printer << ' '; + printer << getOffsets().getType(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); } LogicalResult UpdateOffsetOp::verify() { + auto mode = getMode(); + if (mode != ModeKind::VC) + return emitOpError("UpdateOffsetOp only work on VC mode.\n"); + auto srcTy = getTensorDesc().getType(); - auto offTy 
= getOffsets().getType(); auto resTy = getResult().getType(); - if (srcTy != resTy) - return emitOpError( - "The result should have the same type" - "(shape and encoding attribute) as the input TensorDesc."); - - auto shape = srcTy.getShape(); + return emitOpError("The result should have the same type (shape and " + "encoding attribute) as the input TensorDesc."); if (!srcTy.getScattered()) { return emitOpError("Invalid TensorDesc. UpdateOffsetOp only works on " "TensorDescs with ScatteredAttr."); } - auto vecTy = llvm::dyn_cast(offTy); - if (!vecTy || vecTy.getRank() != 1) + auto offTy = llvm::dyn_cast(getOffsets().getType()); + if (!offTy || offTy.getRank() != 1) return emitOpError("The offset should be an 1D vector.\n"); - if (shape[0] != vecTy.getShape()[0]) + auto shape = srcTy.getShape(); + if (shape[0] != offTy.getShape()[0]) return emitOpError( "The offset should have same length as the dim-0 of TensorDesc."); return success(); } -LogicalResult UpdateNDOffsetOp::verify() { - // number of offsets specified must match the rank of the tensor descriptor - if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) { - return emitOpError("Invalid number of offsets."); +//===----------------------------------------------------------------------===// +// XeGPU_DpasOp +//===----------------------------------------------------------------------===// +ParseResult DpasOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + + llvm::SMLoc OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + + return success(); +} + +void DpasOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer << ' '; + printer << getLhs(); + printer << ","; + printer << ' '; + printer << getRhs(); + if (Value value = getAcc()) + printer << ", " << value; + + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getLhs().getType(); + printer << ","; + printer << ' '; + printer << getRhs().getType(); + if (getAcc()) { + printer << ","; + printer << ' '; + printer << llvm::ArrayRef(getAcc().getType()); } + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult DpasOp::verify() { + int64_t lhsRank = getLhsType().getRank(); + int64_t rhsRank = getRhsType().getRank(); + Type lhsElemType = getLhsType().getElementType(); + Type rhsElemType = getRhsType().getElementType(); + + if (lhsElemType != rhsElemType) + return emitOpError("lhs and rhs element type does not match for dpas op"); + + if (getAcc() && getAccType() != getResultType()) + return 
emitOpError("Accumulator and Result for dpas op should have the " + "same type (both shape and element type)."); + + if (lhsRank != rhsRank || lhsRank != 3) + return emitOpError( + "lhs and rhs rank does not match for dpas op, or their rank is not 3."); + return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_InvokeSIMDOp +//===----------------------------------------------------------------------===// void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, SymbolRefAttr callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { state.addOperands(operands); state.addAttribute("argType", argType); state.addAttribute("callee", callee); @@ -1682,25 +1729,20 @@ void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, StringAttr callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { build(builder, state, SymbolRefAttr::get(callee), results, argType, operands); } void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state, llvm::StringRef callee, TypeRange results, - ArgTypeAttr argType, ValueRange operands) { + ArgTypeKindAttr argType, ValueRange operands) { build(builder, state, StringAttr::get(builder.getContext(), callee), results, argType, operands); } -LogicalResult AtomicRMWOp::verify() { - auto mode = getMode(); - if (mode != Mode::VC) { - return emitOpError("AtomicRMWOp only work on VC mode.\n"); - } - return success(); -} - +//===----------------------------------------------------------------------===// +// XeGPU_AtomicRMWOp +//===----------------------------------------------------------------------===// void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, AtomicRMWKindAttr kind, Value tensorDesc, Value mask, Value value) { @@ -1710,7 +1752,7 @@ void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, state.addOperands(value); state.getOrAddProperties().kind = kind; state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(result); } @@ -1724,10 +1766,161 @@ void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result, state.getOrAddProperties().kind = AtomicRMWKindAttr::get(builder.getContext(), kind); state.getOrAddProperties().mode = - ModeAttr::get(builder.getContext(), Mode::VC); + ModeKindAttr::get(builder.getContext(), ModeKind::VC); state.addTypes(result); } +ParseResult AtomicRMWOp::parse(OpAsmParser &parser, OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + llvm::SmallVector resultTypes(1); + + xegpu::AtomicRMWKindAttr kindAttr; + if (parser.parseCustomAttributeWithFallback(kindAttr, Type{})) + return failure(); + if (kindAttr) + result.getOrAddProperties().kind = kindAttr; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseTypeList(Types)) + 
return failure(); + + if (parser.parseArrow()) + return failure(); + + if (parser.parseCustomTypeWithFallback(resultTypes[0])) + return failure(); + result.addTypes(resultTypes); + + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); +} + +void AtomicRMWOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + + printer.printStrippedAttrOrType(getKindAttr()); + printer << ' '; + printer << getTensorDesc(); + printer << ","; + printer << ' '; + printer << getMask(); + if (Value value = getValue()) + printer << ", " << value; + + llvm::SmallVector elidedAttrs; + elidedAttrs.push_back("kind"); + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getOperation()->getOperandTypes(); + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + +LogicalResult AtomicRMWOp::verify() { + auto mode = getMode(); + if (mode != ModeKind::VC) + return emitOpError("AtomicRMWOp only work on VC mode.\n"); + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNbarrierOp +//===----------------------------------------------------------------------===// +ParseResult CreateNbarrierOp::parse(OpAsmParser &parser, + OperationState &result) { + llvm::SmallVector Operands; + llvm::SmallVector Types; + llvm::SMLoc OperandsLoc; + + OperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(Operands)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + if (parser.parseLParen()) + return failure(); + + if (parser.parseTypeList(Types)) + return failure(); + + if (parser.parseRParen()) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector resultTypes(1); + if (parser.parseType(resultTypes[0])) + return failure(); + + result.addTypes(resultTypes); + if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands)) + return failure(); + return success(); +} + +void CreateNbarrierOp::print(OpAsmPrinter &printer) { + auto mode = getMode(); + auto printDefaults = printDefaultValues(); + llvm::SmallVector elidedAttrs; + if (!printDefaults && mode == xegpu::ModeKind::SIMT) + elidedAttrs.push_back("mode"); + + printer << ' '; + printer << getNbarrierId(); + printer << ","; + printer << ' '; + printer << getNbarrierRole(); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' ' << "("; + printer << getNbarrierId().getType(); + printer << ","; + printer << ' '; + printer << getNbarrierRole().getType(); + printer << ")"; + printer << ' ' << "->"; + printer << ' '; + printer << getResult().getType(); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir index 24aa836f80d44..64a6f547fbd29 100644 --- a/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir @@ -9,12 +9,12 @@ func.func @test_create_nd_tdesc_vc(%src: 
memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.create_nd_tdesc + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %src[2, 4] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> @@ -24,8 +24,7 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc {{.*}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> @@ -34,14 +33,12 @@ func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_load_nd_vc({{.*}}) { func.func @test_load_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg1, %arg2] + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 0 : i64} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return @@ -52,71 +49,62 @@ func.func @test_store_nd_vc(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %3 = xegpu.load_nd %1 {mode=vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd {{%[0-9], %[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> xegpu.store_nd 
%3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> return } - // CHECK-LABEL: func @test_dpas_vc({{.*}}) { func.func @test_dpas_vc(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { - // CHECK: xegpu.dpas + // CHECK: xegpu.dpas {{.*}} {mode = #xegpu} // CHECK-SAME: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> %1 = xegpu.dpas %a, %b {mode = vc}: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } - // CHECK-LABEL: func @test_update_nd_offset_vc({{.*}}) { func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd {{%[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.update_nd_offset + // CHECK: xegpu.update_nd_offset {{%[0-9]}}, [{{%c[0-9], %c[0-9]}}] {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.update_nd_offset %1, [%c0, %c1]: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc}: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> return } // CHECK-LABEL: func @test_prefetch_nd_vc({{.*}}) { func.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd {{%[0-9]}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> return } diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir index 25e7de33c6c12..f80df161a543a 100644 --- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir +++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir @@ -10,7 +10,7 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : v // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> - xegpu.atomic_rmw "addf" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw #xegpu %1, %mask, %value {mode=vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32> return @@ -23,7 +23,7 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> - xegpu.atomic_rmw "mulf" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw mulf %1, %mask, 
%value {mode=vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> return @@ -36,7 +36,7 @@ func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> - xegpu.atomic_rmw "andi" %1, %mask, %value {mode=vc} + xegpu.atomic_rmw andi %1, %mask, %value {mode=vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> return diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir index 096451df04564..0f7229a02aa18 100644 --- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir @@ -10,7 +10,7 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : v // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> - xegpu.atomic_rmw "addf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32> + xegpu.atomic_rmw addf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32> return } @@ -21,7 +21,7 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> - xegpu.atomic_rmw "mulf" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> + xegpu.atomic_rmw mulf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32> return } @@ -32,7 +32,7 @@ func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> - xegpu.atomic_rmw "andi" %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> + xegpu.atomic_rmw andi %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32> return } diff --git a/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir index 9cbc8b4f7d94b..245d862e302a7 100644 --- a/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir @@ -7,8 +7,7 @@ // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> return @@ -16,8 +15,7 @@ func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) { func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc} + // CHECK: 
xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> @@ -26,8 +24,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) { func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> @@ -36,8 +33,7 @@ func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) { func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> @@ -47,8 +43,7 @@ func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) { func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu} // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr> diff --git a/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir index 2689d401dc316..a3cb890483e63 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir @@ -8,13 +8,11 @@ // CHECK-LABEL: func @test_load_gather_vc({{.*}}) { func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> @@ -24,31 +22,27 @@ 
func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { // CHECK-LABEL: func @test_load_gather_vc_2({{.*}}) { func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16x8xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, transpose = array} // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> %2 = xegpu.load %1, %0 {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> return } -// CHECK-LABEL: func @test_load_gather_vc_4({{.*}}) { -func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) { +// CHECK-LABEL: func @test_load_gather_vc_3({{.*}}) { +func.func @test_load_gather_vc_3(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 1} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir index d05a0b523c51d..0644565c3f002 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_nd.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir @@ -13,50 +13,38 @@ func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> -> vector<4x1x2xf16> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc 
%{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map> -> vector<8x1x2xf16> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> -> vector<8x1xf32> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> %7 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map> -> vector<4x1x2xf16> %8 = xegpu.load_nd %7 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> -> vector<4x1x2xf16> return @@ -70,39 +58,27 @@ func.func @test_load_nd_bf16(%A: memref<24x32xbf16>, %B : memref<24x32xbf16>, %C %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> - %1 = xegpu.create_nd_tdesc %A[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> + %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x2xbf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map> -> vector<4x1x2xbf16> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> -> vector<4x1x2xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> - %3 = xegpu.create_nd_tdesc %B[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> + %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> - // CHECK: xegpu.load_nd - 
// CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x2xbf16> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map> -> vector<8x1x2xbf16> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> -> vector<8x1x2xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<24x32xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - %5 = xegpu.create_nd_tdesc %C[%c0, %c1] - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> + %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x1xf32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_bf16_c> -> vector<8x1xf32> return @@ -116,39 +92,28 @@ func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : mem %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> - %1 = xegpu.create_nd_tdesc %A[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> + %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 1} - // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> - // CHECK-SAME: -> vector<4x1x4xi8> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map> -> vector<4x1x4xi8> %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x1x4xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> - %3 = xegpu.create_nd_tdesc %B[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> + %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> - // CHECK: xegpu.load_nd - // CHECK-SAME: {vnni_axis = 0} - // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> - // CHECK-SAME: -> vector<8x1x4xi8> + // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map> -> vector<8x1x4xi8> %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x1x4xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<64x64xi8> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> - %5 = xegpu.create_nd_tdesc %C[%c0, %c1] - : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> + %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> - // CHECK: xegpu.load_nd - // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> - // CHECK-SAME: -> 
vector<8x1xi32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map> -> vector<8x1xi32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> -> vector<8x1xi32> return diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir index 8703f171ac9df..78980b551c067 100644 --- a/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir @@ -10,36 +10,31 @@ func.func @test_load_nd_simd_f32(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {mode = #xegpu} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME:{mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint = streaming} - // CHECK-SAME:!xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, l3_hint = #xegpu, mode = #xegpu, transpose = array} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> %3 = xegpu.load_nd %1 {mode= vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming} : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32> return } // CHECK-LABEL: func @test_load_nd_simd_f16({{.*}}) { func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg1, %arg2] - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} - : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} - // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + // CHECK: xegpu.load_nd %{{[0-9]+}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 0 : i64} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return } @@ -47,13 +42,11 @@ func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index // CHECK-LABEL: func @test_load_nd_simd_bf16({{.*}}) { func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: {mode = vc} - // CHECK-SAME: ui64 -> 
!xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : ui64 -> !xegpu.tensor_desc<8x16xbf16> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : ui64 -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, vnni_axis = 1, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu, vnni_axis = 1 : i64} // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16> %2 = xegpu.load_nd %1 {mode=vc, vnni_axis = 1, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16> @@ -62,14 +55,13 @@ func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index // CHECK-LABEL: func @test_load_nd_block_array_simd_f16({{.*}}) { func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[0, 0] {mode = #xegpu} // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc} : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr> -> vector<2x8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir index aec7689e92e70..6e2cb4de4ce1d 100644 --- a/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir @@ -8,9 +8,11 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} + // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu} : !xegpu.tensor_desc<8x16xf32> xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32> return @@ -18,12 +20,14 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_1({{.*}}) { func.func @test_prefetch_nd_tdesc_vc_1(%src: memref<24x32xf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} - : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + + // CHECK: xegpu.prefetch_nd %{{[0-9]}} + // 
CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> return } @@ -34,9 +38,11 @@ func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} + // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xi8> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu} : !xegpu.tensor_desc<8x16xi8> xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xi8> return @@ -44,12 +50,13 @@ func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) { // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_bf16({{.*}}) { func.func @test_prefetch_nd_tdesc_vc_bf16(%src: memref<24x32xbf16>, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.prefetch_nd %0 {mode = vc, l1_hint = uncached, l2_hint = cached} : !xegpu.tensor_desc<8x16xbf16> + // CHECK: xegpu.prefetch_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> xegpu.prefetch_nd %1 {mode = vc, l1_hint = uncached, l2_hint = cached}: !xegpu.tensor_desc<8x16xbf16> return } diff --git a/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir index 695c189627e1a..170b3a9fe8147 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir @@ -9,25 +9,21 @@ func.func @test_store_nd_vc_bf16(%src: memref<24x32xbf16>, %dst: memref<24x32xbf %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} - : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> + %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xbf16> -> 
vector<8x16xbf16> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16> return @@ -38,25 +34,22 @@ func.func @test_store_nd_vc_f64(%src: memref<24x32xf64>, %dst: memref<24x32xf64> %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} - : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> + %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64> return @@ -67,25 +60,23 @@ func.func @test_store_nd_vc_i8(%src: memref<24x32xi8>, %dst: memref<24x32xi8>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8> %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8> - // CHECK: xegpu.store_nd - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // 
CHECK-SAME: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8> xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8> return diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir index 4bc631acc5125..6d98ac3950c31 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir @@ -7,25 +7,21 @@ // CHECK-LABEL: func @test_store_scatter({{.*}}) { func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui64) { %0 = arith.constant dense: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets {mode = vc} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.store - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} + // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir index d1e57ddda45e2..c1a51712e7003 100644 --- a/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir @@ -7,26 +7,22 @@ // CHECK-LABEL: func @test_store_scatter_vc({{.*}}) { func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = 
#xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.store - // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached} - // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> + // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} + // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> return diff --git a/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir index e25edcdd72b2a..1b97be77a2d79 100644 --- a/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir +++ b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir @@ -8,22 +8,20 @@ func.func @test_update_nd_offset_vc_0(%src: memref<24x32xf32>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index - // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc} - // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] + // CHECK-SAME: {mode = #xegpu} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: xegpu.load_nd - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load_nd %{{[0-9]}} + // CHECK-SAME: {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK: xegpu.update_nd_offset + // CHECK: xegpu.update_nd_offset %{{[0-9]}}, [%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc} - : !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> return } diff --git a/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 0852484423693..05b0092d2379b 100644 --- a/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -7,14 +7,12 @@ // CHECK-LABEL: func @test_update_offset_VC({{.*}}) { func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc} + // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - // CHECK: xegpu.load - // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached} + // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu, l2_hint = #xegpu, mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> %2 = xegpu.load %1, 
%0 {mode = vc, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> @@ -22,7 +20,7 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %3 = arith.constant dense<16>: vector<16 x index> %4 = arith.addi %offsets, %3: vector<16 x index> - // CHECK: xegpu.update_offset + // CHECK: xegpu.update_offset %{{[0-9]}}, %{{[0-9]}} {mode = #xegpu} // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %5 = xegpu.update_offset %1, %4 {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
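
Note (not part of the patch): the custom parsers and printers above, as well as the updated tests, rely on the xegpu::ModeKind enum (SIMT/VC) introduced by this change. For reference, below is a minimal C++ sketch of how the generated enum helpers could be used by client code, assuming the standard stringify/symbolize functions that MLIR's EnumsGen emits for an I32EnumAttr; the header path and helper names are assumptions for illustration, not taken from this patch.

#include "mlir/Dialect/XeGPU/IR/XeGPU.h" // assumed umbrella header for the dialect
#include "llvm/ADT/StringRef.h"
#include <optional>

// Map the textual mnemonic used in the assembly (e.g. "vc") back to the enum.
// symbolizeModeKind is the conventional helper generated for an I32EnumAttr.
static bool isVCMode(llvm::StringRef spelling) {
  std::optional<mlir::xegpu::ModeKind> kind =
      mlir::xegpu::symbolizeModeKind(spelling);
  return kind && *kind == mlir::xegpu::ModeKind::VC;
}

// Map the enum back to its assembly mnemonic, e.g. when building diagnostics.
static llvm::StringRef modeMnemonic(mlir::xegpu::ModeKind kind) {
  return mlir::xegpu::stringifyModeKind(kind);
}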