From 6a940afdb720bc761a419bef6afab287756d33e8 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 30 May 2023 15:26:03 +0800 Subject: [PATCH] 256 avx2 and optimize sbox via reduce logic operations --- _asm/bs_amd64_asm.go | 3605 ++++++++++++++++++++++--------- bs128.go | 2 +- bs128_test.go | 10 +- bs_amd64.go | 20 +- bs_amd64.s | 4469 +++++++++++++++++++++++++++++++++------ transpose_amd64_test.go | 47 + 6 files changed, 6420 insertions(+), 1733 deletions(-) diff --git a/_asm/bs_amd64_asm.go b/_asm/bs_amd64_asm.go index ebc0f45..c79cfe7 100644 --- a/_asm/bs_amd64_asm.go +++ b/_asm/bs_amd64_asm.go @@ -203,7 +203,7 @@ func transpose128() { RET() } -func getFirst4Bytes(flipMask, in Mem, o, addr, x Register) { +func getFirst4Bytes128(flipMask, in Mem, o, addr, x Register) { for i := 0; i < 4; i++ { MOVL(in.Idx(addr, 1), o) PINSRD(Imm(uint64(i)), o, x) @@ -212,6 +212,15 @@ func getFirst4Bytes(flipMask, in Mem, o, addr, x Register) { PSHUFB(flipMask, x) } +func getFirst4Bytes256(flipMask, in Mem, o, addr, x Register) { + for i := 0; i < 4; i++ { + MOVL(in.Idx(addr, 1), o) + PINSRD(Imm(uint64(i)), o, x) + ADDQ(Imm(32), addr) + } + PSHUFB(flipMask, x) +} + func transpose128avx(flipMask Mem) { // transpose128avx function TEXT("transpose128avx", NOSPLIT, "func(in, out *byte)") @@ -242,14 +251,14 @@ func transpose128avx(flipMask Mem) { SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of first 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -366,250 +375,198 @@ func transpose128avx(flipMask Mem) { RET() } -func transpose128Rev() { - // transpose128Rev function - TEXT("transpose128Rev", NOSPLIT, "func(in, out *byte)") - Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") +func transpose256avx(flipMask Mem) { + // transpose256avx function + TEXT("transpose256avx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, 256x128 => 128x256") in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} - tmp := XMM() - b := GP8() + h, l := X1, X0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + tmp := Y0 o := GP32() - - Comment("Initialize rr, current row, 96") - rr := zero() cc := GP64() - addr := GP64() - Label("row_loop_b3") + Comment("Initialize rr, current row") + rr := zero() + Label("row_loop") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b3") + Label("col_loop") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + addr := GP64() MOVQ(rr, addr) - ADDQ(Imm(96), addr) Comment("Multiple with ncols") SHLQ(Imm(7), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") 
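For orientation, the bit-level transpose that transpose128avx / transpose256avx (and the scalar fallbacks below) compute can be written as a plain-Go reference. This is only an illustrative sketch, not part of the patch; it assumes MSB-first bit numbering inside each byte, which is the convention the flip mask and the PMOVMSKB/PSLLQ extraction loop are arranged around.

	// Reference sketch (assumption: MSB-first bit order within each byte).
	// Bit (r, c) of an nrows x ncols bit matrix moves to bit (c, r) of the
	// ncols x nrows output; both are stored row-major, 8 bits per byte.
	// The index expressions mirror the (rr*ncols+cc)/8 and
	// ((cc+7)*nrows+rr)/8 address arithmetic in the generated code.
	// out must be zeroed by the caller.
	func transposeBitsRef(in, out []byte, nrows, ncols int) {
		for r := 0; r < nrows; r++ {
			for c := 0; c < ncols; c++ {
				bit := (in[(r*ncols+c)/8] >> uint(7-c%8)) & 1
				out[(c*nrows+r)/8] |= bit << uint(7-r%8)
			}
		}
	}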
- ADDQ(Imm(16), addr) - } + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b3")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(32)) - JL(LabelRef("row_loop_b3")) - - Label("row_loop_b2") - Comment("Initialize cc, current col") - XORQ(cc, cc) - Label("col_loop_b2") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - ADDQ(Imm(32), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b2")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(64)) - JL(LabelRef("row_loop_b2")) - - Label("row_loop_b1") - Comment("Initialize cc, current col") - 
XORQ(cc, cc) - Label("col_loop_b1") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - SUBQ(Imm(32), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b1")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(96)) - JL(LabelRef("row_loop_b1")) - - Label("row_loop_b0") - Comment("Initialize cc, current col") - XORQ(cc, cc) - Label("col_loop_b0") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - SUBQ(Imm(96), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - ADDQ(Imm(8), cc) Comment("Compare cc with ncols, here ncols=128") CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b0")) - ADDQ(Imm(16), rr) - Comment("Compare rr with nrows, here nrows=128") - CMPQ(rr, U8(128)) - JL(LabelRef("row_loop_b0")) + JL(LabelRef("col_loop")) + ADDQ(Imm(32), rr) + Comment("Compare rr with nrows, here nrows=256") + CMPQ(rr, U32(256)) + JL(LabelRef("row_loop")) + VZEROUPPER() RET() } -func transpose128RevAvx(flipMask Mem) { - // transpose128RevAvx function - TEXT("transpose128RevAvx", NOSPLIT, "func(in, out *byte)") - Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") +func transpose128x256avx2(flipMask Mem) { + // transpose128x256avx2 function + TEXT("transpose128x256avx2", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, 
128x256 => 256x128, just for test here.") in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} h, l := X1, X0 - tmp := Y0 t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + tmp := Y0 o := GP32() - - Comment("Initialize rr, current row, 96") - rr := zero() cc := GP64() - addr := GP64() - Label("row_loop_b3") + Comment("Initialize rr, current row") + rr := zero() + Label("row_loop") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b3") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(12288), addr) + Label("col_loop") + + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + addr := GP64() + MOVQ(rr, addr) + Comment("Multiple with ncols") + SHLQ(Imm(8), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) - Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -632,10 +589,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t1, l) VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -649,10 +610,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t2, l) VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -666,10 +631,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t3, l) VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -683,10 +652,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t4, l) VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 
7; i >= 0; i-- { @@ -698,29 +671,262 @@ func transpose128RevAvx(flipMask Mem) { } ADDQ(Imm(8), cc) - Comment("Compare cc with ncols, here ncols=128") - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b3")) - + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop")) ADDQ(Imm(32), rr) + Comment("Compare rr with nrows, here nrows=128") + CMPQ(rr, U32(128)) + JL(LabelRef("row_loop")) + + VZEROUPPER() + RET() +} + +func transpose128Rev() { + // transpose128Rev function + TEXT("transpose128Rev", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") + + in := Mem{Base: Load(Param("in"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + tmp := XMM() + b := GP8() + o := GP32() + + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() + + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + ADDQ(Imm(96), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b3")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(32)) + JL(LabelRef("row_loop_b3")) + Label("row_loop_b2") Comment("Initialize cc, current col") XORQ(cc, cc) Label("col_loop_b2") Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(8192), addr) + MOVQ(rr, addr) + ADDQ(Imm(32), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b2")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(64)) + JL(LabelRef("row_loop_b2")) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + SUBQ(Imm(32), addr) + Comment("Multiple with ncols") + 
SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b1")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(96)) + JL(LabelRef("row_loop_b1")) + + Label("row_loop_b0") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b0") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + SUBQ(Imm(96), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=128") + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b0")) + ADDQ(Imm(16), rr) + Comment("Compare rr with nrows, here nrows=128") + CMPQ(rr, U8(128)) + JL(LabelRef("row_loop_b0")) + + RET() +} + +func transpose128RevAvx(flipMask Mem) { + // transpose128RevAvx function + TEXT("transpose128RevAvx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") + + in := Mem{Base: Load(Param("in"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + h, l := X1, X0 + tmp := Y0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + o := GP32() + + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() + + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(U32(12288), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, 
o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -747,7 +953,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -765,7 +970,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -783,7 +987,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -801,7 +1004,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -815,28 +1017,27 @@ func transpose128RevAvx(flipMask Mem) { Comment("Compare cc with ncols, here ncols=128") CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b2")) + JL(LabelRef("col_loop_b3")) ADDQ(Imm(32), rr) - - Label("row_loop_b1") + Label("row_loop_b2") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b1") + Label("col_loop_b2") Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(4096), addr) + MOVQ(U32(8192), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -863,7 +1064,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -881,7 +1082,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -899,7 +1100,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -917,7 +1118,123 @@ func 
transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=128") + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b2")) + + ADDQ(Imm(32), rr) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(U32(4096), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -943,14 +1260,14 @@ func transpose128RevAvx(flipMask Mem) { SHRQ(Imm(3), addr) Comment("Construct eight XMM 
with first 4 bytes of first 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -1051,89 +1368,567 @@ func transpose128RevAvx(flipMask Mem) { RET() } -func xor32x128() { - // xor32x128 function - TEXT("xor32x128", NOSPLIT, "func(x, y, out *byte)") - Doc("out = x xor y") - x := Mem{Base: Load(Param("x"), GP64())} - y := Mem{Base: Load(Param("y"), GP64())} +func transpose256RevAvx(flipMask Mem) { + // transpose256RevAvx function + TEXT("transpose256RevAvx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x256") + + in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} - X := XMM() - Y := XMM() + h, l := X1, X0 + tmp := Y0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + o := GP32() - count := zero() - Label("xor32_loop") - MOVOU(x.Idx(count, 1), X) - MOVOU(y.Idx(count, 1), Y) - PXOR(X, Y) - MOVOU(Y, out.Idx(count, 1)) - ADDQ(U8(16), count) - CMPQ(count, U32(512)) - JL(LabelRef("xor32_loop")) + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() - RET() -} + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(24576), addr) // 96 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) -func xor32x128avx() { - // xor32x128 function - TEXT("xor32x128avx", NOSPLIT, "func(x, y, out *byte)") - Doc("out = x xor y") - x := Mem{Base: Load(Param("x"), GP64())} - y := Mem{Base: Load(Param("y"), GP64())} - out := Mem{Base: Load(Param("out"), GP64())} + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) - X := YMM() - Y := YMM() + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) - count := zero() - Label("xor32_loop_avx") - VMOVDQU(x.Idx(count, 1), X) - VMOVDQU(y.Idx(count, 1), Y) - VPXOR(X, Y, Y) - VMOVDQU(Y, out.Idx(count, 1)) - ADDQ(U8(32), count) - CMPQ(count, U32(512)) - JL(LabelRef("xor32_loop_avx")) + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) - VZEROUPPER() - RET() -} + MOVOU(t1, l) + VINSERTI128(Imm(1), 
t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) -func xorRoundKey128() { - // xorRoundKey128 function - TEXT("xorRoundKey128", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") - Doc("xor x1, x2, x3 with round key, 16 bytes per bit") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) - x := Load(Param("rk"), GP32()) - x1 := Mem{Base: Load(Param("x1"), GP64())} - x2 := Mem{Base: Load(Param("x2"), GP64())} - x3 := Mem{Base: Load(Param("x3"), GP64())} - out := Mem{Base: Load(Param("out"), GP64())} + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) - ret := XMM() - one := XMM() - PCMPEQB(one, one) + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) - y := GP32() + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) - count := GP64() - XORQ(count, count) - Comment("Handle first byte") - MOVL(U32(0x01000000), y) - Label("rk_loop_1") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_1_c")) - PXOR(one, ret) - Label("rk_loop_1_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(128)) - JL(LabelRef("rk_loop_1")) + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b3")) + + ADDQ(Imm(32), rr) + Label("row_loop_b2") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b2") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(16384), addr) // 64 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + 
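The eight-instruction unpack sequence used at every "Matrix transform 4x4" site in this file is the usual 4x4 transpose of 32-bit lanes spread across four XMM registers; its net effect, as a plain-Go sketch for reference only:

	// Net effect of the VPUNPCKL/HDQ + VPUNPCKL/HQDQ sequence (sketch):
	// on entry t[i] holds the four 32-bit lanes of register ti+1,
	// on return t[i] holds lane i of each of the four original registers.
	func transpose4x4Lanes(t *[4][4]uint32) {
		var r [4][4]uint32
		for i := 0; i < 4; i++ {
			for j := 0; j < 4; j++ {
				r[j][i] = t[i][j]
			}
		}
		*t = r
	}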
VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b2")) + + ADDQ(Imm(32), rr) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(8192), addr) // 32 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + 
VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b1")) + + ADDQ(Imm(32), rr) + Label("row_loop_b0") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b0") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + 
VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b0")) + + VZEROUPPER() + RET() +} + +func xor32x128() { + // xor32x128 function + TEXT("xor32x128", NOSPLIT, "func(x, y, out *byte)") + Doc("out = x xor y") + x := Mem{Base: Load(Param("x"), GP64())} + y := Mem{Base: Load(Param("y"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + X := XMM() + Y := XMM() + + count := zero() + Label("xor32_loop") + MOVOU(x.Idx(count, 1), X) + MOVOU(y.Idx(count, 1), Y) + PXOR(X, Y) + MOVOU(Y, out.Idx(count, 1)) + ADDQ(U8(16), count) + CMPQ(count, U32(512)) + JL(LabelRef("xor32_loop")) + + RET() +} + +func xor32x128avx() { + // xor32x128 function + TEXT("xor32x128avx", NOSPLIT, "func(len int, x, y, out *byte)") + Doc("out = x xor y") + x := Mem{Base: Load(Param("x"), GP64())} + y := Mem{Base: Load(Param("y"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + len := Load(Param("len"), GP64()) + + X := YMM() + Y := YMM() + + count := zero() + Label("xor32_loop_avx") + VMOVDQU(x.Idx(count, 1), X) + VMOVDQU(y.Idx(count, 1), Y) + VPXOR(X, Y, Y) + VMOVDQU(Y, out.Idx(count, 1)) + ADDQ(U8(32), count) + CMPQ(count, len) + JL(LabelRef("xor32_loop_avx")) + + VZEROUPPER() + RET() +} + +func xorRoundKey128() { + // xorRoundKey128 function + TEXT("xorRoundKey128", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") + Doc("xor x1, x2, x3 with round key, 16 bytes per bit") + + x := Load(Param("rk"), GP32()) + x1 := Mem{Base: Load(Param("x1"), GP64())} + x2 := Mem{Base: Load(Param("x2"), GP64())} + x3 := Mem{Base: Load(Param("x3"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + ret := XMM() + one := XMM() + PCMPEQB(one, one) + + y := GP32() + + count := GP64() + XORQ(count, count) + Comment("Handle first byte") + MOVL(U32(0x01000000), y) + Label("rk_loop_1") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + 
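The TESTL/JZ pair around this point implements the per-bit conditional complement: for each bit of the 32-bit round key, the corresponding 16-byte slice of x1^x2^x3 is XORed with the all-ones register when that bit is set. A plain-Go sketch of xorRoundKey128's semantics (illustrative only; the slice-to-bit mapping follows the 0x01000000 mask rolled left per slice, then 0x00010000, 0x00000100, 0x00000001 for the later bytes):

	// Reference sketch of xorRoundKey128 (not part of the patch):
	// x1, x2, x3 and out are 512-byte bit-sliced states, 16 bytes per key bit.
	func xorRoundKey128Ref(rk uint32, x1, x2, x3, out *[512]byte) {
		for i := 0; i < 32; i++ {
			// slice 0 follows rk bit 24 (0x01000000), slice 7 bit 31,
			// slice 8 bit 16, and so on, matching the rk_loop masks.
			bit := uint((3-i/8)*8 + i%8)
			var mask byte
			if rk&(uint32(1)<<bit) != 0 {
				mask = 0xff // complement this slice, like PXOR with the all-ones register
			}
			for j := 0; j < 16; j++ {
				out[i*16+j] = x1[i*16+j] ^ x2[i*16+j] ^ x3[i*16+j] ^ mask
			}
		}
	}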
JZ(LabelRef("rk_loop_1_c")) + PXOR(one, ret) + Label("rk_loop_1_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(128)) + JL(LabelRef("rk_loop_1")) Comment("Handle second byte") MOVL(U32(0x00010000), y) @@ -1142,868 +1937,1616 @@ func xorRoundKey128() { PXOR(x2.Idx(count, 1), ret) PXOR(x3.Idx(count, 1), ret) TESTL(x, y) - JZ(LabelRef("rk_loop_2_c")) + JZ(LabelRef("rk_loop_2_c")) + PXOR(one, ret) + Label("rk_loop_2_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(256)) + JL(LabelRef("rk_loop_2")) + + Comment("Handle third byte") + MOVL(U32(0x00000100), y) + Label("rk_loop_3") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_3_c")) + PXOR(one, ret) + Label("rk_loop_3_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(384)) + JL(LabelRef("rk_loop_3")) + + Comment("Handle last byte") + MOVL(U32(0x00000001), y) + Label("rk_loop_4") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_4_c")) PXOR(one, ret) - Label("rk_loop_2_c") + Label("rk_loop_4_c") MOVOU(ret, out.Idx(count, 1)) ROLL(U8(1), y) ADDQ(U8(16), count) - CMPQ(count, U32(256)) - JL(LabelRef("rk_loop_2")) + CMPQ(count, U32(512)) + JL(LabelRef("rk_loop_4")) + + RET() +} + +func sbox128() { + // sbox128 function + TEXT("sbox128", NOSPLIT, "func(x, buffer *byte)") + Doc("sbox128, 128 bits per 'byte'") + + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} + + Comment("f, for not operation") + f := XMM() + PCMPEQB(f, f) + + Comment("Start input function") + Comment("t1=b7 ^ b5") + t1 := XMM() + MOVOU(b.Offset(7*16), t1) + PXOR(b.Offset(5*16), t1) + + t2, t7, t8 := XMM(), XMM(), XMM() + MOVOU(b.Offset(1*16), t2) + MOVOU(t2, t7) + MOVOU(t2, t8) + Comment("store m6=b1") + MOVOU(t2, buffer.Offset((8+6)*16)) // m6 + Comment("t2=b5 ^ b1") + PXOR(b.Offset(5*16), t2) + PANDN(f, t2) + + t3, t4 := XMM(), XMM() + Comment("store g5=^b0") + MOVOU(b, t3) + MOVOU(t3, t4) + PANDN(f, t4) + MOVOU(t4, buffer.Offset(5*16)) // g5 + Comment("t3=^(b0 ^ t2)") + PXOR(t2, t3) + PANDN(f, t3) + + Comment("t4=b6 ^ b2") + t12 := XMM() + MOVOU(b.Offset(6*16), t4) + MOVOU(t4, t12) + PXOR(b.Offset(2*16), t4) + + Comment("t5=b3 ^ t3") + t5, t11 := XMM(), XMM() + MOVOU(b.Offset(3*16), t5) + MOVOU(t5, t11) + PXOR(t3, t5) + + Comment("t6=b4 ^ t1") + t6 := XMM() + MOVOU(b.Offset(4*16), t6) + PXOR(t1, t6) + + Comment("t7=b1 ^ t5") + PXOR(t5, t7) + Comment("t8=b1 ^ t4") + PXOR(t4, t8) + + Comment("t9=t6 ^ t8") + t9 := XMM() + MOVOU(t6, t9) + PXOR(t8, t9) + Comment("store m8") + MOVOU(t9, buffer.Offset((8+8)*16)) // m8 + Comment("store g1") + MOVOU(t7, buffer.Offset(1*16)) // g1 + Comment("store g3") + MOVOU(t5, buffer.Offset(3*16)) // g3 + Comment("store g4") + MOVOU(t2, buffer.Offset(4*16)) // g4 + Comment("store m0") + MOVOU(t6, buffer.Offset((8+0)*16)) // m0 + Comment("store m1") + MOVOU(t3, buffer.Offset((8+1)*16)) // m1 + Comment("store m2") + MOVOU(t8, buffer.Offset((8+2)*16)) // m2 + Comment("store m4") + MOVOU(t4, buffer.Offset((8+4)*16)) // m4 + + Comment("t11=^(b3 ^ t1)") + PXOR(t1, t11) + PANDN(f, t11) + Comment("store m5, can reuse t1 now") + MOVOU(t11, buffer.Offset((8+5)*16)) // m5 + + Comment("t12=^(b6 ^ t9)") + PXOR(t9, t12) + PANDN(f, t12) + Comment("store m9, can reuse t7 t8 t9 now") + MOVOU(t12, buffer.Offset((8+9)*16)) // m9 + + 
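In the rewritten sbox128, the register f is filled with all ones once via PCMPEQB, and every logical NOT afterwards is a single PANDN(f, x), which leaves ^x in x; the input function above then factors the S-box into shared terms g0..g7 and m0..m9 that are parked in the caller-provided buffer so later stages reuse them instead of recomputing. A lane-wise Go sketch of the first few terms, transcribed from the comments (one 64-bit lane stands in for a 128-bit slice):

	// Sketch of the start of the bit-sliced S-box input function; the real
	// code works on XMM slices and uses PANDN with f = all ones for the NOTs.
	func sboxInputSketch(b *[8]uint64) (t1, t2, t3 uint64) {
		t1 = b[7] ^ b[5]    // t1 = b7 ^ b5
		t2 = ^(b[5] ^ b[1]) // t2 = ^(b5 ^ b1): PXOR then PANDN with all ones
		t3 = ^(b[0] ^ t2)   // t3 = ^(b0 ^ t2)
		return
	}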
Comment("t10=t6 ^ t7") + t10 := t7 + PXOR(t6, t10) + Comment("store g0, can reuse t6 now") + MOVOU(t10, buffer) // g0 + + Comment("t13=t4 ^ t10") + t13 := t10 + PXOR(t4, t13) + Comment("store g2, can reuse t4 now") + MOVOU(t13, buffer.Offset(2*16)) // g2 + + Comment("t14=t2 ^ t11") + t14 := t1 + MOVOU(t11, t14) + PXOR(t2, t14) + Comment("store g6, can reuse t2 now") + MOVOU(t14, buffer.Offset(6*16)) // g6 + + Comment("t15=t12^t14") + t15 := t14 + PXOR(t12, t15) + Comment("store g7") + MOVOU(t15, buffer.Offset(7*16)) // g7 + + Comment("t16=t3 ^ t12") + t16 := t12 + PXOR(t3, t16) + Comment("store m3") + MOVOU(t16, buffer.Offset((8+3)*16)) // m3 + + Comment("t17=t11 ^ t16") + t17 := t16 + PXOR(t11, t17) + Comment("store m7") + MOVOU(t17, buffer.Offset((8+7)*16)) // m7 + + Comment("Start top function") + Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") + // t1 = g7 + // t2 = g4 + // t3 = m1 + // t4 = m4 + // t5 = g3 + // t6 = m0 + // t7 = g2 + // t8 = m2 + // t9 = m8 + // t11 = m5 + // t12 = m7 + Comment("t2=m0 & m1") + PAND(t6, t3) // t2 := t3 + + Comment("t3=g0 & g4") + PAND(buffer, t2) // t3 := t2 + + Comment("t4=g3 & g7") + MOVOU(t1, t6) + PAND(t5, t1) // t4 := t1 + + Comment("t7=g3 | g7") + POR(t6, t5) // t7 := t5 + + Comment("t11=m4 & m5") + PAND(t4, t11) // t11 + + MOVOU(buffer.Offset((8+3)*16), t4) // t4 = m3 + MOVOU(t4, t6) + Comment("t10=m3 & m2") + PAND(t8, t6) // t10 := t6 + Comment("t12=m3 | m2") + POR(t8, t4) // t12 := t4 + + Comment("t6=g6 | g2") + POR(buffer.Offset(6*16), t7) // t6 := t7 + + Comment("t9=m6 | m7") + POR(buffer.Offset((8+6)*16), t12) // t9 := t12 + + t10 = XMM() + MOVOU(buffer.Offset((8+9)*16), t8) // t8 = m9 + MOVOU(t8, t10) + + Comment("t5=m8 & m9") + PAND(t9, t8) // t5 := t8 + Comment("t8=m8 | m9") + POR(t9, t10) // t8 := t10 + + Comment("t14 = t3 ^ t2") + PXOR(t3, t2) // t14 = t3 ^ t2 + Comment("t16 = t5 ^ t14") + PXOR(t2, t8) // t16 = t5 ^ t14, can reuse t2 now + Comment("t20 = t16 ^ t7") + PXOR(t8, t5) // t20 = t16 ^ t7 + Comment("t17 = t9 ^ t10") + PXOR(t12, t6) // t17 = t9 ^ t10 + Comment("t18 = t11 ^ t12") + PXOR(t11, t4) // t18 = t11 ^ t12 + Comment("p2 = t20 ^ t18") + PXOR(t5, t4) // p2 = t20 ^ t18, can reuse t5 now + Comment("p0 = t6 ^ t16") + PXOR(t7, t8) // p0 = t6 ^ t16 + Comment("t1 = g5 & g1") + MOVOU(buffer.Offset(1*16), t2) + MOVOU(buffer.Offset(5*16), t5) + PAND(t2, t5) // t1 := t5 + Comment("t13 = t1 ^ t2") + PXOR(t5, t3) // t13 = t1 ^ t2 + Comment("t15 = t13 ^ t4") + PXOR(t1, t3) // t15 = t4 ^ t13 + Comment("t19 = t6 ^ t15") + PXOR(t3, t7) // t19 = t6 ^ t15 + Comment("p3 = t19 ^ t17") + PXOR(t6, t7) // p3 = t19 ^ t17 + Comment("p1 = t8 ^ t15") + PXOR(t10, t3) // p1 = t8 ^ t15 + + Comment("start middle function") + Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") + + // t3 = p1 + // t4 = p2 + // t7 = p3 + // t8 = p0 + Comment("t0 = p1 & p2") + MOVOU(t3, t1) + PAND(t4, t1) // t0 := t1 + + Comment("t1 = p3 & p0") + MOVOU(t8, t2) + PAND(t7, t2) // t1 := t2 + + Comment("t2 = p0 & p2") + MOVOU(t4, t5) + PAND(t8, t5) // t2 := t5 + + Comment("t3 = p1 & p3") + MOVOU(t3, t6) + PAND(t7, t6) // t3 := t6 + + Comment("t4 = t0 & t2") + MOVOU(t1, t9) + PAND(t5, t9) // t4 := t9 + + Comment("t5 = t1 & t3") + MOVOU(t2, t10) + PXOR(t6, t10) // t5 := t10 + + Comment("t6 = t5 | p0") + POR(t10, t8) // t6 := t8 + + Comment("t7 = t2 | p3") + POR(t5, t7) // t7 + + Comment("t8 = t4 ^ t6") + PXOR(t9, t8) // l3 = t8 + + Comment("t9 = t7 ^ t3") + PXOR(t7, t6) // t9 := t6 
+ + Comment("t10 = t0 ^ t9") + PXOR(t1, t6) // l0 = t10 := t6 + + Comment("t11 = p2 | t5") + POR(t10, t4) // t11 := t4 + Comment("l1 = t11 ^ t1") + PXOR(t4, t2) // l1 := t2 + + Comment("t12 = p1 | t2") + POR(t5, t3) // t12 := t3 + Comment("l2 = t12 ^ t5") + PXOR(t10, t3) // l2 := t3 + + Comment("start bottom function") + Comment("current register status: t6=l0, t2=l1, t3=l2, t8=l3") + Comment("k4 = l2 ^ l3") + MOVOU(t8, t5) + PXOR(t3, t5) // k4 := t5 + + Comment("k3 = l1 ^ l3") + MOVOU(t8, t4) + PXOR(t2, t4) // k3 := t4 + + Comment("k2 = l0 ^ l2") + MOVOU(t6, t7) + PXOR(t3, t7) // k2 := t7 + + Comment("k0 = l0 ^ l1") + MOVOU(t6, t1) + PXOR(t2, t1) // k0 := t1 + + Comment("k1 = k2 ^ k3") + MOVOU(t4, t9) + PXOR(t7, t9) // k1 := t9 + + Comment("e0=(m1 & k0)") + MOVOU(buffer.Offset((8+1)*16), t10) // m1 + PAND(t1, t10) // e0 := t10 + + Comment("e1=(g5 & l1)") + MOVOU(buffer.Offset(5*16), t11) + PAND(t2, t11) // e1 := t11 + + Comment("r0=e0 ^ e1") + PXOR(t11, t10) // r0 = e0 ^ e1 + + Comment("e2=(g4 & l0)") + MOVOU(buffer.Offset(4*16), t12) + PAND(t6, t12) + + Comment("r1=e2 ^ e1") + PXOR(t12, t11) // r1 = e2 ^ e1 + + Comment("store r0 r1") + MOVOU(t10, buffer.Offset(22*16)) // in fact, we can start from 18*16 + MOVOU(t11, buffer.Offset(23*16)) + + Comment("e3=(m7 & k3)") + MOVOU(buffer.Offset((8+7)*16), t10) // m7 + PAND(t4, t10) + + Comment("e4=(m5 & k2)") + MOVOU(buffer.Offset((8+5)*16), t11) // m5 + PAND(t7, t11) + Comment("r2=e3 ^ e4") + PXOR(t11, t10) // r2 = e3 ^ e4 + + Comment("e5=(m3 & k1)") + MOVOU(buffer.Offset((8+3)*16), t12) // m3 + PAND(t9, t12) + Comment("r3=e5 ^ e4") + PXOR(t12, t11) // r3 = e5 ^ e4 + + Comment("store r2 r3") + MOVOU(t10, buffer.Offset(24*16)) + MOVOU(t11, buffer.Offset(25*16)) + + Comment("e6=(m9 & k4)") + MOVOU(buffer.Offset((8+9)*16), t10) // m9 + PAND(t5, t10) + + Comment("e7=(g7 & l3)") + MOVOU(buffer.Offset(7*16), t11) + PAND(t8, t11) + Comment("r4=e7 ^ e6") + PXOR(t11, t10) // r4 = e6 ^ e7 + + Comment("e8=(g6 & l2)") + MOVOU(buffer.Offset(6*16), t12) + PAND(t3, t12) + Comment("r5=e8 ^ e6") + PXOR(t11, t12) // r5 = e8 ^ e7 + + Comment("store r4") + MOVOU(t10, buffer.Offset(26*16)) + + Comment("e9=(m0 & k0)") + MOVOU(buffer.Offset((8+0)*16), t10) // m0 + PAND(t1, t10) // e9 := t10 + + Comment("e10=(g1 & l1)") + MOVOU(buffer.Offset(1*16), t1) + PAND(t2, t1) // e10 := t1 + + Comment("r6=e9 ^ e10") + PXOR(t1, t10) // r6 = e9 ^ e10 + + Comment("e11=(g0 & l0)") + MOVOU(buffer, t11) + PAND(t11, t6) // e11 := t6 + Comment("r7=e11 ^ e10") + PXOR(t6, t1) // r7 = e11 ^ e10 = t1 - Comment("Handle third byte") - MOVL(U32(0x00000100), y) - Label("rk_loop_3") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_3_c")) - PXOR(one, ret) - Label("rk_loop_3_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(384)) - JL(LabelRef("rk_loop_3")) + Comment("e12=(m6 & k3)") + MOVOU(buffer.Offset((8+6)*16), t2) // m6 + PAND(t4, t2) - Comment("Handle last byte") - MOVL(U32(0x00000001), y) - Label("rk_loop_4") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_4_c")) - PXOR(one, ret) - Label("rk_loop_4_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(512)) - JL(LabelRef("rk_loop_4")) + Comment("e13=(m4 & k2)") + MOVOU(buffer.Offset((8+4)*16), t6) // m4 + PAND(t7, t6) + + Comment("r8=e12 ^ e13") + PXOR(t6, t2) // r8 = e12 ^ e13 = t2 + + 
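The bottom function derives its five k masks from the four l values with five XORs, forming k1 from k2 ^ k3 instead of recomputing it from l0..l3; this sharing of intermediate terms is the "reduce logic operations" idea in the commit title. A lane-wise Go sketch of those formulas, taken directly from the comments above:

	// Bottom-function masks, one 64-bit lane per slice (sketch of the
	// commented formulas; the generated code keeps these in XMM registers).
	func sboxBottomSketch(l0, l1, l2, l3 uint64) (k0, k1, k2, k3, k4 uint64) {
		k4 = l2 ^ l3
		k3 = l1 ^ l3
		k2 = l0 ^ l2
		k0 = l0 ^ l1
		k1 = k2 ^ k3 // shared term: equals l0^l1^l2^l3 with no extra work
		return
	}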
Comment("e14=(m2 & k1)") + MOVOU(buffer.Offset((8+2)*16), t4) // m2 + PAND(t9, t4) + Comment("r9=e14 ^ e13") + PXOR(t6, t4) // r9 = e14 ^ e13 = t4 + + Comment("e15=(m8 & k4)") + MOVOU(buffer.Offset((8+8)*16), t9) // m8 + PAND(t9, t5) + + Comment("e16=(g3 & l3)") + MOVOU(buffer.Offset(3*16), t9) + PAND(t9, t8) + Comment("r10=e15 ^ e16") + PXOR(t8, t5) // r10 = e15 ^ e16 = t5 + + Comment("e17=(g2 & l2)") + MOVOU(buffer.Offset(2*16), t11) + PAND(t11, t3) + Comment("r11=e17 ^ e16") + PXOR(t8, t3) // r11 = e17 ^ e16 = t3 + + Comment("start output function") + // t12 = r5 + // t10 = r6 + // t1 = r7 + // t2 = r8 + // t4 = r9 + // t5 = r10 + // t3 = r11 + Comment("[t1]=r7 ^ r9") + PXOR(t1, t4) // [t1] = t4 = r7 ^ r9 + + Comment("[t2]=t1 ^ r1") + MOVOU(buffer.Offset((22+1)*16), t6) // r1 + PXOR(t4, t6) // [t2] = t6 = r1 ^ [t1] + + Comment("[t3]=t2 ^ r3") + MOVOU(buffer.Offset((22+3)*16), t7) // r3 + MOVOU(t7, t8) + PXOR(t6, t7) // [t3] = t7 = r3 ^ [t2] + Comment("[t4]=r5 ^ r3") + PXOR(t12, t8) // [t4] = t8 = r5 ^ r3 + + Comment("[t5]=r4 ^ [t4]") + MOVOU(buffer.Offset((22+4)*16), t9) // r4 + MOVOU(t9, t11) + PXOR(t8, t9) // [t5] = t9 = r4 ^ t4 + Comment("[t6]=r0 ^ r4") + PXOR(buffer.Offset(22*16), t11) // [t6] = t11 = r4 ^ r0 + + Comment("[t7]=r11 ^ r7") + PXOR(t3, t1) // [t7] t1 = r7 ^ r11 + + Comment("[t8]=[t1] ^ [t4]") + PXOR(t4, t8) // t8 = t4 ^ t11 + Comment("store t8") + MOVOU(t8, b.Offset(5*16)) + + Comment("[t9]=[t1] ^ [t6]") + PXOR(t11, t4) // [t9] = t4 + Comment("store t9") + MOVOU(t4, b.Offset(2*16)) + + Comment("[t10]=r2 ^ t5") + PXOR(buffer.Offset((22+2)*16), t9) // [t10] t9 = r2 ^ [t5] + Comment("[t11]=r10 ^ r8") + PXOR(t5, t2) // [t11] = t2 + Comment("store t11") + MOVOU(t2, b.Offset(3*16)) + Comment("[t12]=^([t3] ^ [t11])") + PXOR(t7, t2) + PANDN(f, t2) // [t12] = t2 + Comment("store t12") + MOVOU(t2, b.Offset(1*16)) + Comment("[t13]=[t10] ^ [t12]") + PXOR(t2, t9) // [t13] = t9 + Comment("store t13") + MOVOU(t9, b.Offset(6*16)) + + Comment("[t14]=^([t3] ^ [t7])") + PXOR(t7, t1) + PANDN(f, t1) // [t14] + Comment("store t14") + MOVOU(t1, b.Offset(4*16)) + Comment("[t16]=[t6] ^ [t14]") + PXOR(t11, t1) // [t16] + Comment("store t16") + MOVOU(t1, b) + + Comment("[t15]=^(r10 ^ r6)") + PXOR(t10, t5) + PANDN(f, t5) + Comment("store t15") + MOVOU(t5, b.Offset(7*16)) RET() } -func sbox128() { - // sbox128 function - TEXT("sbox128", NOSPLIT, "func(x, buffer *byte)") - Doc("sbox128, 128 bits per 'byte'") +// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 +// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 +// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 +// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 +// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 +func l128() { + // l128 function + TEXT("l128", NOSPLIT, "func(x, buffer *byte)") + Doc("l128, 128 bits per 'byte'") b := Mem{Base: Load(Param("x"), GP64())} buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("f, for not operation") - f := XMM() - PCMPEQB(f, f) + X0, X1, X2, X3, X4, X5, X6, X7, X8, X9 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() - Comment("Start input function") - Comment("t1=b7 ^ b5") - t1 := XMM() - MOVOU(b.Offset(7*16), t1) - PXOR(b.Offset(5*16), t1) + MOVOU(b, X0) + MOVOU(b.Offset(8*16), X1) + MOVOU(b.Offset(16*16), X2) + MOVOU(b.Offset(24*16), X3) + 
MOVOU(b.Offset(18*16), X5) + MOVOU(b.Offset(22*16), X6) + MOVOU(b.Offset(26*16), X7) + MOVOU(b.Offset(30*16), X8) + MOVOU(b.Offset(2*16), X9) - t2, t7, t8 := XMM(), XMM(), XMM() - MOVOU(b.Offset(1*16), t2) - MOVOU(t2, t7) - MOVOU(t2, t8) - Comment("store m6=b1") - MOVOU(t2, buffer.Offset((8+6)*16)) // m6 - Comment("t2=b5 ^ b1") - PXOR(b.Offset(5*16), t2) - PANDN(f, t2) + Comment("0=0^24^14^22^30") + MOVOU(X0, X4) + PXOR(X3, X4) + PXOR(b.Offset(14*16), X4) + PXOR(X6, X4) + PXOR(X8, X4) + MOVOU(X4, buffer) - t3, t4 := XMM(), XMM() - Comment("store g5=^b0") - MOVOU(b, t3) - MOVOU(t3, t4) - PANDN(f, t4) - MOVOU(t4, buffer.Offset(5*16)) // g5 - Comment("t3=^(b0 ^ t2)") - PXOR(t2, t3) - PANDN(f, t3) + Comment("2=0^2^26^8^16") + MOVOU(X0, X4) + PXOR(X9, X4) + PXOR(X7, X4) + PXOR(X1, X4) + PXOR(X2, X4) + MOVOU(X4, buffer.Offset(2*16)) - Comment("t4=b6 ^ b2") - t12 := XMM() - MOVOU(b.Offset(6*16), t4) - MOVOU(t4, t12) - PXOR(b.Offset(2*16), t4) + Comment("8=0^8^22^30^6") + MOVOU(X0, X4) + PXOR(X1, X4) + PXOR(X6, X4) + PXOR(X8, X4) + PXOR(b.Offset(6*16), X4) + MOVOU(X4, buffer.Offset(8*16)) + + Comment("18=0^18^10^16^24") + MOVOU(X0, X4) + PXOR(X5, X4) + PXOR(b.Offset(10*16), X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(18*16)) + + Comment("26=0^26^18^24^8") + PXOR(X1, X0) + PXOR(X7, X0) + PXOR(X5, X0) + PXOR(X3, X0) + MOVOU(X0, buffer.Offset(26*16)) + + Comment("10=10^2^8^16^24") + MOVOU(X9, X4) + PXOR(b.Offset(10*16), X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(10*16)) + + MOVOU(b.Offset(6*16), X0) + MOVOU(b.Offset(14*16), X5) + Comment("16=16^8^30^6^14") + PXOR(X2, X1) + PXOR(X8, X1) + PXOR(X0, X1) + PXOR(X5, X1) + MOVOU(X1, buffer.Offset(16*16)) + + Comment("24=24^16^6^14^22") + PXOR(X3, X2) + PXOR(X0, X2) + PXOR(X5, X2) + PXOR(X6, X2) + MOVOU(X2, buffer.Offset(24*16)) + + MOVOU(b.Offset(4*16), X1) + MOVOU(b.Offset(10*16), X2) + MOVOU(b.Offset(12*16), X3) + // X0=6, X1=4, X9=X4=2, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + Comment("4=4^28^2^10^18") + MOVOU(X9, X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(b.Offset(18*16), X4) + PXOR(b.Offset(28*16), X4) + MOVOU(X4, buffer.Offset(4*16)) + + Comment("20=20^12^18^26^2") + MOVOU(X9, X4) + PXOR(b.Offset(20*16), X4) + PXOR(X3, X4) + PXOR(b.Offset(18*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(20*16)) + + Comment("28=28^20^26^2^10") + PXOR(b.Offset(28*16), X9) + PXOR(b.Offset(20*16), X9) + PXOR(X7, X9) + PXOR(X2, X9) + MOVOU(X9, buffer.Offset(28*16)) + + MOVOU(b.Offset(20*16), X9) + // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + + Comment("6=6^30^4^12^20") + MOVOU(X1, X4) + PXOR(X0, X4) + PXOR(X3, X4) + PXOR(X8, X4) + PXOR(X9, X4) + MOVOU(X4, buffer.Offset(6*16)) + + Comment("12=12^4^10^18^26") + MOVOU(X1, X4) + PXOR(X3, X4) + PXOR(X2, X4) + PXOR(b.Offset(18*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(12*16)) + + MOVOU(b.Offset(28*16), X7) + // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=28, X8=30 + Comment("22=22^14^20^28^4") + MOVOU(X1, X4) + PXOR(X5, X4) + PXOR(X6, X4) + PXOR(X9, X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(22*16)) + + Comment("30=30^22^28^4^12") + PXOR(X8, X1) + PXOR(X6, X1) + PXOR(X3, X1) + PXOR(X7, X1) + MOVOU(X1, buffer.Offset(30*16)) + + Comment("14=14^6^12^20^28") + PXOR(X3, X0) + PXOR(X7, X0) + PXOR(X9, X0) + PXOR(X5, X0) + MOVOU(X0, buffer.Offset(14*16)) - Comment("t5=b3 ^ t3") - t5, t11 := XMM(), XMM() - MOVOU(b.Offset(3*16), t5) - MOVOU(t5, t11) - PXOR(t3, t5) + MOVOU(b.Offset(1*16), X0) + MOVOU(b.Offset(9*16), X1) + MOVOU(b.Offset(17*16), X2) + 
MOVOU(b.Offset(25*16), X3) + MOVOU(b.Offset(19*16), X5) + MOVOU(b.Offset(23*16), X6) + MOVOU(b.Offset(27*16), X7) + MOVOU(b.Offset(31*16), X8) + MOVOU(b.Offset(3*16), X9) - Comment("t6=b4 ^ t1") - t6 := XMM() - MOVOU(b.Offset(4*16), t6) - PXOR(t1, t6) + Comment("1=1^25^15^23^31") + MOVOU(X0, X4) + PXOR(X3, X4) + PXOR(b.Offset(15*16), X4) + PXOR(X6, X4) + PXOR(X8, X4) + MOVOU(X4, buffer.Offset(1*16)) - Comment("t7=b1 ^ t5") - PXOR(t5, t7) - Comment("t8=b1 ^ t4") - PXOR(t4, t8) + Comment("3=3^27^1^9^17") + MOVOU(X0, X4) + PXOR(X9, X4) + PXOR(X7, X4) + PXOR(X1, X4) + PXOR(X2, X4) + MOVOU(X4, buffer.Offset(3*16)) - Comment("t9=t6 ^ t8") - t9 := XMM() - MOVOU(t6, t9) - PXOR(t8, t9) - Comment("store m8") - MOVOU(t9, buffer.Offset((8+8)*16)) // m8 - Comment("store g1") - MOVOU(t7, buffer.Offset(1*16)) // g1 - Comment("store g3") - MOVOU(t5, buffer.Offset(3*16)) // g3 - Comment("store g4") - MOVOU(t2, buffer.Offset(4*16)) // g4 - Comment("store m0") - MOVOU(t6, buffer.Offset((8+0)*16)) // m0 - Comment("store m1") - MOVOU(t3, buffer.Offset((8+1)*16)) // m1 - Comment("store m2") - MOVOU(t8, buffer.Offset((8+2)*16)) // m2 - Comment("store m4") - MOVOU(t4, buffer.Offset((8+4)*16)) // m4 + Comment("9=9^1^23^31^7") + MOVOU(X0, X4) + PXOR(X1, X4) + PXOR(X6, X4) + PXOR(X8, X4) + PXOR(b.Offset(7*16), X4) + MOVOU(X4, buffer.Offset(9*16)) - Comment("t11=^(b3 ^ t1)") - PXOR(t1, t11) - PANDN(f, t11) - Comment("store m5, can reuse t1 now") - MOVOU(t11, buffer.Offset((8+5)*16)) // m5 + Comment("19=1^19^11^17^25") + MOVOU(X0, X4) + PXOR(X5, X4) + PXOR(b.Offset(11*16), X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(19*16)) - Comment("t12=^(b6 ^ t9)") - PXOR(t9, t12) - PANDN(f, t12) - Comment("store m9, can reuse t7 t8 t9 now") - MOVOU(t12, buffer.Offset((8+9)*16)) // m9 + Comment("27=1^27^19^25^9") + PXOR(X1, X0) + PXOR(X7, X0) + PXOR(X5, X0) + PXOR(X3, X0) + MOVOU(X0, buffer.Offset(27*16)) - Comment("t10=t6 ^ t7") - t10 := t7 - PXOR(t6, t10) - Comment("store g0, can reuse t6 now") - MOVOU(t10, buffer) // g0 + Comment("11=11^3^9^17^25") + MOVOU(X9, X4) + PXOR(b.Offset(11*16), X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(11*16)) - Comment("t13=t4 ^ t10") - t13 := t10 - PXOR(t4, t13) - Comment("store g2, can reuse t4 now") - MOVOU(t13, buffer.Offset(2*16)) // g2 + MOVOU(b.Offset(7*16), X0) + MOVOU(b.Offset(15*16), X5) + Comment("17=17^9^31^7^15") + PXOR(X2, X1) + PXOR(X8, X1) + PXOR(X0, X1) + PXOR(X5, X1) + MOVOU(X1, buffer.Offset(17*16)) - Comment("t14=t2 ^ t11") - t14 := t1 - MOVOU(t11, t14) - PXOR(t2, t14) - Comment("store g6, can reuse t2 now") - MOVOU(t14, buffer.Offset(6*16)) // g6 + Comment("25=25^17^7^15^23") + PXOR(X3, X2) + PXOR(X0, X2) + PXOR(X5, X2) + PXOR(X6, X2) + MOVOU(X2, buffer.Offset(25*16)) - Comment("t15=t12^t14") - t15 := t14 - PXOR(t12, t15) - Comment("store g7") - MOVOU(t15, buffer.Offset(7*16)) // g7 + MOVOU(b.Offset(5*16), X1) + MOVOU(b.Offset(11*16), X2) + MOVOU(b.Offset(13*16), X3) + // X0=7, X1=5, X9=X4=3, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 + Comment("5=5^29^3^11^19") + MOVOU(X9, X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(b.Offset(19*16), X4) + PXOR(b.Offset(29*16), X4) + MOVOU(X4, buffer.Offset(5*16)) - Comment("t16=t3 ^ t12") - t16 := t12 - PXOR(t3, t16) - Comment("store m3") - MOVOU(t16, buffer.Offset((8+3)*16)) // m3 + Comment("21=21^13^19^27^3") + MOVOU(X9, X4) + PXOR(b.Offset(21*16), X4) + PXOR(X3, X4) + PXOR(b.Offset(19*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(21*16)) - Comment("t17=t11 ^ t16") - t17 := t16 - PXOR(t11, t17) - 
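The per-bit XOR lists in the comments of l128 and l256 (for example 0=0^24^14^22^30 and 1=1^25^15^23^31 above), together with the five index rows before each function, are the bit-sliced expansion of the SM4 linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24), spelled out in this implementation's bit-slice ordering. A scalar reference, matching the expectation the updated TestL128 later in this patch builds with math/bits:

import "math/bits"

// lRef is the word-level SM4 linear transform; l128/l256 compute the same
// function with one 128-/256-bit register per bit position.
func lRef(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

Each output slot is the XOR of exactly five input slots, which is why every group above is one load, four PXOR/VPXOR operations and a store.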
Comment("store m7") - MOVOU(t17, buffer.Offset((8+7)*16)) // m7 + Comment("29=29^21^27^3^11") + PXOR(b.Offset(29*16), X9) + PXOR(b.Offset(21*16), X9) + PXOR(X7, X9) + PXOR(X2, X9) + MOVOU(X9, buffer.Offset(29*16)) - Comment("Start top function") - Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") - // t1 = g7 - // t2 = g4 - // t3 = m1 - // t4 = m4 - // t5 = g3 - // t6 = m0 - // t7 = g2 - // t8 = m2 - // t9 = m8 - // t11 = m5 - // t12 = m7 - Comment("t2=^(m0 & m1)") - PAND(t6, t3) - PANDN(f, t3) // t2 + MOVOU(b.Offset(21*16), X9) + // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 - Comment("t3=^(g0 & g4)") - PAND(buffer, t2) - PANDN(f, t2) // t3 + Comment("7=7^31^5^13^21") + MOVOU(X1, X4) + PXOR(X0, X4) + PXOR(X3, X4) + PXOR(X8, X4) + PXOR(X9, X4) + MOVOU(X4, buffer.Offset(7*16)) - Comment("t4=^(g3 & g7)") - MOVOU(t1, t6) - PAND(t5, t1) - PANDN(f, t1) // t4 + Comment("13=13^5^11^19^27") + MOVOU(X1, X4) + PXOR(X3, X4) + PXOR(X2, X4) + PXOR(b.Offset(19*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(13*16)) - Comment("t7=^(g3 | g7)") - POR(t6, t5) - PANDN(f, t5) // t7 + MOVOU(b.Offset(29*16), X7) + // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=29, X8=31 + Comment("23=23^15^21^29^5") + MOVOU(X1, X4) + PXOR(X5, X4) + PXOR(X6, X4) + PXOR(X9, X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(23*16)) - Comment("t11=^(m4 & m5)") - PAND(t4, t11) - PANDN(f, t11) // t11 + Comment("31=31^23^29^5^13") + PXOR(X8, X1) + PXOR(X6, X1) + PXOR(X3, X1) + PXOR(X7, X1) + MOVOU(X1, buffer.Offset(31*16)) - MOVOU(buffer.Offset((8+3)*16), t4) // t4 = m3 - MOVOU(t4, t6) - Comment("t10=^( m3 & m2 )") - PAND(t8, t6) - PANDN(f, t6) // t10 - Comment("t12=^( m3 | m2 )") - POR(t8, t4) - PANDN(f, t4) // t12 + Comment("15=15^7^13^21^29") + PXOR(X3, X0) + PXOR(X7, X0) + PXOR(X9, X0) + PXOR(X5, X0) + MOVOU(X0, buffer.Offset(15*16)) - Comment("t6=^( g6 | g2 )") - POR(buffer.Offset(6*16), t7) - PANDN(f, t7) // t6 + RET() +} - Comment("t9=^( m6 | m7 )") - POR(buffer.Offset((8+6)*16), t12) - PANDN(f, t12) // t9 +// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 +// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 +// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 +// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 +// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 +func l256() { + // l256 function + TEXT("l256", NOSPLIT, "func(x, buffer *byte)") + Doc("l256, 256 bits per 'byte'") - t10 = XMM() - MOVOU(buffer.Offset((8+9)*16), t8) // t8 = m9 - MOVOU(t8, t10) + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("t5=^( m8 & m9 )") - PAND(t9, t8) - PANDN(f, t8) // t5 - Comment("t8=^( m8 | m9 )") - POR(t9, t10) - PANDN(f, t10) // t8 + y0, y1, y2, y3, y4, y5, y6, y7, y8, y9 := YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM() - Comment("t14 = t3 ^ t2") - PXOR(t3, t2) // t14 = t3 ^ t2 - Comment("t16 = t5 ^ t14") - PXOR(t2, t8) // t16 = t5 ^ t14, can reuse t2 now - Comment("t20 = t16 ^ t7") - PXOR(t8, t5) // t20 = t16 ^ t7 - Comment("t17 = t9 ^ t10") - PXOR(t12, t6) // t17 = t9 ^ t10 - Comment("t18 = t11 ^ t12") - PXOR(t11, t4) // t18 = t11 ^ t12 - Comment("p2 = t20 ^ t18") - PXOR(t5, t4) // p2 = t20 ^ t18, can reuse t5 now - Comment("p0 = t6 ^ 
t16") - PXOR(t7, t8) // p0 = t6 ^ t16 - Comment("t1 = ^(g5 & g1)") - MOVOU(buffer.Offset(1*16), t2) - MOVOU(buffer.Offset(5*16), t5) - PAND(t2, t5) - PANDN(f, t5) // t1 - Comment("t13 = t1 ^ t2") - PXOR(t5, t3) // t13 = t1 ^ t2 - Comment("t15 = t13 ^ t4") - PXOR(t1, t3) // t15 = t4 ^ t13 - Comment("t19 = t6 ^ t15") - PXOR(t3, t7) // t19 = t6 ^ t15 - Comment("p3 = t19 ^ t17") - PXOR(t6, t7) // p3 = t19 ^ t17 - Comment("p1 = t8 ^ t15") - PXOR(t10, t3) // p1 = t8 ^ t15 + VMOVDQU(b, y0) + VMOVDQU(b.Offset(8*32), y1) + VMOVDQU(b.Offset(16*32), y2) + VMOVDQU(b.Offset(24*32), y3) + VMOVDQU(b.Offset(18*32), y5) + VMOVDQU(b.Offset(22*32), y6) + VMOVDQU(b.Offset(26*32), y7) + VMOVDQU(b.Offset(30*32), y8) + VMOVDQU(b.Offset(2*32), y9) + + Comment("0=0^24^14^22^30") + VPXOR(y3, y0, y4) + VPXOR(b.Offset(14*32), y4, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VMOVDQU(y4, buffer) - Comment("start middle function") - Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") + Comment("2=0^2^26^8^16") + VPXOR(y9, y0, y4) + VPXOR(y7, y4, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VMOVDQU(y4, buffer.Offset(2*32)) - // t3 = p1 - // t4 = p2 - // t7 = p3 - // t8 = p0 - Comment("t1 = ^(p3 & p0)") - MOVOU(t8, t1) - PAND(t7, t1) - PANDN(f, t1) // t1 = ^(p3 & p0) - - Comment("t2 = ^(t1 | p2)") - MOVOU(t4, t2) - POR(t1, t2) - PANDN(f, t2) // t2 = ^(t1 | p2) - - Comment("t3 = ^(p2 & p0)") - MOVOU(t4, t5) // p2 - PAND(t8, t4) - PANDN(f, t4) // t4 = ^(p2 & p0) - - Comment("t4 = p1 ^ t3") - PXOR(t3, t4) // t4 = p1 ^ t4 - - Comment("t5 = ^(p2 | t4)") - MOVOU(t5, t9) // p2 - POR(t4, t5) - PANDN(f, t5) // t5 = ^(p2 | t4) - - Comment("t6 = ^(p1 & t4)") - MOVOU(t3, t6) // p1 - PAND(t4, t6) - PANDN(f, t6) // t6 = ^(p1 & t4) - - Comment("t7 = ^(p3 | t4)") - MOVOU(t7, t11) // p3 - POR(t4, t7) - PANDN(f, t7) // t7 = ^(p3 | t4) - - Comment("t8 = ^(t7 | t2)") - MOVOU(t8, t12) // p0 - MOVOU(t7, t8) - POR(t2, t8) - PANDN(f, t8) // t8 = ^(t7 | t2) + Comment("8=0^8^22^30^6") + VPXOR(y1, y0, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(b.Offset(6*32), y4, y4) + VMOVDQU(y4, buffer.Offset(8*32)) - Comment("t9 = ^(t7 ^ t5)") - PXOR(t5, t7) - PANDN(f, t7) // t7 = ^(t5 ^ t7) + Comment("18=0^18^10^16^24") + VPXOR(y5, y0, y4) + VPXOR(b.Offset(10*32), y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(18*32)) - Comment("t10 = ^(t9 ^ p3)") - PXOR(t7, t11) - PANDN(f, t11) // l0 = t11 = ^(t7 & p3) + Comment("26=0^26^18^24^8") + VPXOR(y1, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y5, y0, y0) + VPXOR(y3, y0, y0) + VMOVDQU(y0, buffer.Offset(26*32)) - Comment("t11 = ^(t6 & t8)") - PAND(t8, t6) - PANDN(f, t6) // l2 = t6 = ^(t6 & t8) + Comment("10=10^2^8^16^24") + VPXOR(b.Offset(10*32), y9, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(10*32)) + + VMOVDQU(b.Offset(6*32), y0) + VMOVDQU(b.Offset(14*32), y5) + Comment("16=16^8^30^6^14") + VPXOR(y2, y1, y1) + VPXOR(y8, y1, y1) + VPXOR(y0, y1, y1) + VPXOR(y5, y1, y1) + VMOVDQU(y1, buffer.Offset(16*32)) - Comment("t12 = ^(p1 & t8)") - PAND(t8, t3) - PANDN(f, t3) // t3 = ^(t8 & p1) + Comment("24=24^16^6^14^22") + VPXOR(y3, y2, y2) + VPXOR(y0, y2, y2) + VPXOR(y5, y2, y2) + VPXOR(y6, y2, y2) + VMOVDQU(y2, buffer.Offset(24*32)) + + VMOVDQU(b.Offset(4*32), y1) + VMOVDQU(b.Offset(10*32), y2) + VMOVDQU(b.Offset(12*32), y3) + // y0=6, y1=4, y9=y4=2, y2=10, y3=12, y5=14, y6=22, y7=26, y8=30 + Comment("4=4^28^2^10^18") + VPXOR(y1, y9, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(b.Offset(28*32), y4, y4) + VMOVDQU(y4, 
buffer.Offset(4*32)) - Comment("t13 = ^(t12 ^ p0)") - PXOR(t3, t12) - PANDN(f, t12) // l3 = t12 = ^(p0 ^ t3) + Comment("20=20^12^18^26^2") + VPXOR(b.Offset(20*32), y9, y4) + VPXOR(y3, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(20*32)) - Comment("t14 = ^(t1 & p2)") - PAND(t1, t9) - PANDN(f, t9) // t14 + Comment("28=28^20^26^2^10") + VPXOR(b.Offset(28*32), y9, y9) + VPXOR(b.Offset(20*32), y9, y9) + VPXOR(y7, y9, y9) + VPXOR(y2, y9, y9) + VMOVDQU(y9, buffer.Offset(28*32)) - Comment("t15 = ^(t14 & t9)") - PAND(t9, t7) - PANDN(f, t7) // l1 + VMOVDQU(b.Offset(20*32), y9) + // y0=6, y1=4, y9=20, y2=10, y3=12, y5=14, y6=22, y7=26, y8=30 - Comment("start bottom function") - Comment("current register status: t11=l0, t7=l1, t6=l2, t12=l3") - Comment("k4 = l2 ^ l3") - MOVOU(t12, t5) - PXOR(t6, t5) // k4 = l2 ^ l3 - Comment("k3 = l1 ^ l3") - MOVOU(t12, t4) - PXOR(t7, t4) // k3 = l1 ^ l3 - Comment("k2 = l0 ^ l2") - MOVOU(t6, t3) - PXOR(t11, t3) // k2 = l0 ^ l2 - Comment("k0 = l0 ^ l1") - MOVOU(t7, t1) - PXOR(t11, t1) // k0 = l0 ^ l1 - Comment("k1 = k2 ^ k3") - MOVOU(t4, t2) - PXOR(t3, t2) // k1 = k2 ^ k3 + Comment("6=6^30^4^12^20") + VPXOR(y0, y1, y4) + VPXOR(y3, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(y9, y4, y4) + VMOVDQU(y4, buffer.Offset(6*32)) - Comment("e0=^(m1 & k0)") - MOVOU(buffer.Offset((8+1)*16), t8) // m1 - PAND(t1, t8) - PANDN(f, t8) // e0 + Comment("12=12^4^10^18^26") + VPXOR(y3, y1, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(12*32)) + + VMOVDQU(b.Offset(28*32), y7) + // y0=6, y1=4, y9=20, y2=10, y3=12, y5=14, y6=22, y7=28, y8=30 + Comment("22=22^14^20^28^4") + VPXOR(y5, y1, y4) + VPXOR(y6, y4, y4) + VPXOR(y9, y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(22*32)) - Comment("e1=^(g5 & l1)") - MOVOU(buffer.Offset(5*16), t9) - PAND(t7, t9) - PANDN(f, t9) // e1 + Comment("30=30^22^28^4^12") + VPXOR(y8, y1, y1) + VPXOR(y6, y1, y1) + VPXOR(y3, y1, y1) + VPXOR(y7, y1, y1) + VMOVDQU(y1, buffer.Offset(30*32)) - Comment("r0=e0 ^ e1") - PXOR(t9, t8) // r0 = e0 ^ e1 + Comment("14=14^6^12^20^28") + VPXOR(y3, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y9, y0, y0) + VPXOR(y5, y0, y0) + VMOVDQU(y0, buffer.Offset(14*32)) + + VMOVDQU(b.Offset(1*32), y0) + VMOVDQU(b.Offset(9*32), y1) + VMOVDQU(b.Offset(17*32), y2) + VMOVDQU(b.Offset(25*32), y3) + VMOVDQU(b.Offset(19*32), y5) + VMOVDQU(b.Offset(23*32), y6) + VMOVDQU(b.Offset(27*32), y7) + VMOVDQU(b.Offset(31*32), y8) + VMOVDQU(b.Offset(3*32), y9) - Comment("e2=^(g4 & l0)") - MOVOU(buffer.Offset(4*16), t10) - PAND(t11, t10) - PANDN(f, t10) // e2 - Comment("r1=e2 ^ e1") - PXOR(t10, t9) // r1 = e2 ^ e1 + Comment("1=1^25^15^23^31") + VPXOR(y3, y0, y4) + VPXOR(b.Offset(15*32), y4, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VMOVDQU(y4, buffer.Offset(1*32)) - Comment("store r0 r1") - MOVOU(t8, buffer.Offset(22*16)) // in fact, we can start from 18*16 - MOVOU(t9, buffer.Offset(23*16)) - - Comment("e3=^(m7 & k3)") - MOVOU(buffer.Offset((8+7)*16), t8) // m7 - PAND(t4, t8) - PANDN(f, t8) // e3 - - Comment("e4=^(m5 & k2)") - MOVOU(buffer.Offset((8+5)*16), t9) // m5 - PAND(t3, t9) - PANDN(f, t9) // e4 - Comment("r2=e3 ^ e4") - PXOR(t9, t8) // r2 = e3 ^ e4 + Comment("3=3^27^1^9^17") + VPXOR(y9, y0, y4) + VPXOR(y7, y4, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VMOVDQU(y4, buffer.Offset(3*32)) - Comment("e5=^(m3 & k1)") - MOVOU(buffer.Offset((8+3)*16), t10) // m3 - PAND(t2, t10) - PANDN(f, t10) // e5 - Comment("r3=e5 ^ e4") - PXOR(t10, t9) // r3 = e5 ^ e4 + 
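l256 is the same XOR network as l128 widened from XMM to YMM registers (32-byte slots instead of 16), and the VEX three-operand forms also remove the register copies the SSE2 code needs: PXOR overwrites its destination, so l128 has to MOVOU into a scratch register first, while VPXOR writes to a separate destination. A side-by-side fragment in the avo notation used above (not a standalone function):

MOVOU(X0, X4)     // XMM path: copy first, because PXOR is destructive
PXOR(X3, X4)      //   X4 = X0 ^ X3
VPXOR(y3, y0, y4) // YMM path: non-destructive three-operand form, no copy needed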
Comment("9=9^1^23^31^7") + VPXOR(y1, y0, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(b.Offset(7*32), y4, y4) + VMOVDQU(y4, buffer.Offset(9*32)) - Comment("store r2 r3") - MOVOU(t8, buffer.Offset(24*16)) - MOVOU(t9, buffer.Offset(25*16)) - - Comment("e6=^(m9 & k4)") - MOVOU(buffer.Offset((8+9)*16), t8) // m9 - PAND(t5, t8) - PANDN(f, t8) // e6 - - Comment("e7=^(g7 & l3)") - MOVOU(buffer.Offset(7*16), t9) - PAND(t12, t9) - PANDN(f, t9) // e7 - Comment("r4=e7 ^ e6") - PXOR(t9, t8) // r4 = e6 ^ e7 + Comment("19=1^19^11^17^25") + VPXOR(y5, y0, y4) + VPXOR(b.Offset(11*32), y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(19*32)) - Comment("e8=^(g6 & l2)") - MOVOU(buffer.Offset(6*16), t10) - PAND(t6, t10) - PANDN(f, t10) // e8 - Comment("r5=e8 ^ e6") - PXOR(t10, t9) // r5 = e8 ^ e7 + Comment("27=1^27^19^25^9") + VPXOR(y1, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y5, y0, y0) + VPXOR(y3, y0, y0) + VMOVDQU(y0, buffer.Offset(27*32)) - Comment("store r4 r5") - MOVOU(t8, buffer.Offset(26*16)) - MOVOU(t9, buffer.Offset(27*16)) + Comment("11=11^3^9^17^25") + VPXOR(b.Offset(11*32), y9, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(11*32)) + + VMOVDQU(b.Offset(7*32), y0) + VMOVDQU(b.Offset(15*32), y5) + Comment("17=17^9^31^7^15") + VPXOR(y2, y1, y1) + VPXOR(y8, y1, y1) + VPXOR(y0, y1, y1) + VPXOR(y5, y1, y1) + VMOVDQU(y1, buffer.Offset(17*32)) - Comment("e9=^(m0 & k0)") - MOVOU(buffer.Offset((8+0)*16), t8) // m0 - PAND(t1, t8) - PANDN(f, t8) // e9 + Comment("25=25^17^7^15^23") + VPXOR(y3, y2, y2) + VPXOR(y0, y2, y2) + VPXOR(y5, y2, y2) + VPXOR(y6, y2, y2) + VMOVDQU(y2, buffer.Offset(25*32)) + + VMOVDQU(b.Offset(5*32), y1) + VMOVDQU(b.Offset(11*32), y2) + VMOVDQU(b.Offset(13*32), y3) + // y0=7, y1=5, y9=y4=3, y2=11, y3=13, y5=15, y6=23, y7=27, y8=31 + Comment("5=5^29^3^11^19") + VPXOR(y1, y9, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(b.Offset(29*32), y4, y4) + VMOVDQU(y4, buffer.Offset(5*32)) - Comment("e10=^(g1 & l1)") - MOVOU(buffer.Offset(1*16), t1) - PAND(t7, t1) - PANDN(f, t1) // e10 + Comment("21=21^13^19^27^3") + VPXOR(b.Offset(21*32), y9, y4) + VPXOR(y3, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(21*32)) - Comment("r6=e9 ^ e10") - PXOR(t1, t8) // r6 = e9 ^ e10 + Comment("29=29^21^27^3^11") + VPXOR(b.Offset(29*32), y9, y9) + VPXOR(b.Offset(21*32), y9, y9) + VPXOR(y7, y9, y9) + VPXOR(y2, y9, y9) + VMOVDQU(y9, buffer.Offset(29*32)) - Comment("e11=^(g0 & l0)") - MOVOU(buffer, t10) - PAND(t11, t10) - PANDN(f, t10) // e11 - Comment("r7=e11 ^ e10") - PXOR(t10, t1) // r7 = e11 ^ e10 - Comment("store r6") - MOVOU(t8, buffer.Offset(28*16)) + VMOVDQU(b.Offset(21*32), y9) + // y0=7, y1=5, y9=21, y2=11, y3=13, y5=15, y6=23, y7=27, y8=31 - Comment("e12=^(m6 & k3)") - MOVOU(buffer.Offset((8+6)*16), t7) // m6 - PAND(t4, t7) - PANDN(f, t7) // e12 + Comment("7=7^31^5^13^21") + VPXOR(y0, y1, y4) + VPXOR(y3, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(y9, y4, y4) + VMOVDQU(y4, buffer.Offset(7*32)) - Comment("e13=^(m4 & k2)") - MOVOU(buffer.Offset((8+4)*16), t11) // m4 - PAND(t3, t11) - PANDN(f, t11) // e13 + Comment("13=13^5^11^19^27") + VPXOR(y3, y1, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(13*32)) + + VMOVDQU(b.Offset(29*32), y7) + // y0=7, y1=5, y9=21, y2=11, y3=13, y5=15, y6=23, y7=29, y8=31 + Comment("23=23^15^21^29^5") + VPXOR(y5, y1, y4) + VPXOR(y6, y4, y4) + VPXOR(y9, y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, 
buffer.Offset(23*32)) - Comment("r8=e12 ^ e13") - PXOR(t11, t7) // r8 = e12 ^ e13 = t7 + Comment("31=31^23^29^5^13") + VPXOR(y8, y1, y1) + VPXOR(y6, y1, y1) + VPXOR(y3, y1, y1) + VPXOR(y7, y1, y1) + VMOVDQU(y1, buffer.Offset(31*32)) - Comment("e14=^(m2 & k1)") - MOVOU(buffer.Offset((8+2)*16), t10) // m2 - PAND(t2, t10) - PANDN(f, t10) // e14 + Comment("15=15^7^13^21^29") + VPXOR(y3, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y9, y0, y0) + VPXOR(y5, y0, y0) + VMOVDQU(y0, buffer.Offset(15*32)) - Comment("r9=e14 ^ e13") - PXOR(t10, t11) // r9 = e14 ^ e13 = t11 + VZEROUPPER() + RET() +} - Comment("e15=^(m8 & k4)") - MOVOU(buffer.Offset((8+8)*16), t8) // m8 - PAND(t5, t8) - PANDN(f, t8) // e15 +func sbox256avx2() { + // sbox256avx2 function + TEXT("sbox256avx2", NOSPLIT, "func(x, buffer *byte)") + Doc("sbox256avx2, 256 bits per 'byte'") - Comment("e16=^(g3 & l3)") - MOVOU(buffer.Offset(3*16), t9) - PAND(t12, t9) - PANDN(f, t9) // e16 - Comment("r10=e15 ^ e16") - PXOR(t9, t8) // r10 = e15 ^ e16 = t8 + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("e17=^(g2 & l2)") - MOVOU(buffer.Offset(2*16), t10) - PAND(t6, t10) - PANDN(f, t10) // e17 + Comment("f, for not operation") + f := YMM() + VPCMPEQB(f, f, f) - Comment("r11=e17 ^ e16") - PXOR(t10, t9) // r11 = e17 ^ e16 = t9 + Comment("Start input function") + Comment("t1=b7 ^ b5") + t1 := YMM() + VMOVDQU(b.Offset(7*32), t1) + VPXOR(b.Offset(5*32), t1, t1) - Comment("start output function") - // t1 = r7 - // t7 = r8 - // t11 = r9 - // t8 = r10 - // t9 = r11 - Comment("[t1]=r7 ^ r9") - PXOR(t1, t11) // t11 = r7 ^ r9 - Comment("t2=t1 ^ r1") - MOVOU(buffer.Offset((22+1)*16), t2) // r1 - PXOR(t11, t2) // t2 = r1 ^ t11 - Comment("t3=t2 ^ r3") - MOVOU(buffer.Offset((22+3)*16), t3) // r3 - MOVOU(t3, t4) - PXOR(t2, t3) // t3 = r3 ^ t2 - Comment("t4=r5 ^ r3") - PXOR(buffer.Offset((22+5)*16), t4) // t4 = r5 ^ r3 + t2, t7, t8 := YMM(), YMM(), YMM() + VMOVDQU(b.Offset(1*32), t8) + Comment("store m6=b1") + VMOVDQU(t8, buffer.Offset((8+6)*32)) // m6 + Comment("t2=b5 ^ b1") + VPXOR(b.Offset(5*32), t8, t2) + VPANDN(f, t2, t2) - MOVOU(buffer.Offset((22+4)*16), t5) // r4 - MOVOU(t5, t6) - Comment("t5=r4 ^ t4") - PXOR(t4, t5) // t5 = r4 ^ t4 - Comment("t6=r0 ^ t4") - PXOR(buffer.Offset(22*16), t6) // t6 = r4 ^ r0 + t3, t4 := YMM(), YMM() - Comment("[t7]=r11 ^ r7") - PXOR(t9, t1) // [t7] t1 = r7 ^ r11 - Comment("[t8]=[t1] ^ t4") - PXOR(t11, t4) // [t8] t4 = t4 ^ t11 - Comment("store t8") - MOVOU(t4, b.Offset(5*16)) - Comment("[t9]=[t1] ^ t6") - PXOR(t6, t11) // [t9] t11 = t11 ^ t6 - Comment("store t9") - MOVOU(t11, b.Offset(2*16)) + VMOVDQU(b, t4) + Comment("t3=^(b0 ^ t2)") + VPXOR(t2, t4, t3) + VPANDN(f, t3, t3) + Comment("store g5=^b0") + VPANDN(f, t4, t4) + VMOVDQU(t4, buffer.Offset(5*32)) // g5 - Comment("[t10]=r2 ^ t5") - PXOR(buffer.Offset((22+2)*16), t5) // [t10] t5 = r2 ^ t5 - Comment("[t11]=r10 ^ r8") - PXOR(t8, t7) // [t11] t7 = rr8 ^ r10 - Comment("store t11") - MOVOU(t7, b.Offset(3*16)) - Comment("[t12]=^(t3 ^ [t11])") - PXOR(t3, t7) // t7 = t3 ^ [t11] - PANDN(f, t7) // [t12] t7 = ^(t3 ^ [t11]) - Comment("store t12") - MOVOU(t7, b.Offset(1*16)) - Comment("[t13]=[t10] ^ [t12]") - PXOR(t7, t5) // [t13] t5 = [t10] ^ [t12] - Comment("store t13") - MOVOU(t5, b.Offset(6*16)) + Comment("t4=b6 ^ b2") + t12 := YMM() + VMOVDQU(b.Offset(6*32), t12) + VPXOR(b.Offset(2*32), t12, t4) - Comment("[t14]=^(t3 ^ [t7])") - PXOR(t3, t1) - PANDN(f, t1) // [t14] - Comment("store t14") - MOVOU(t1, b.Offset(4*16)) - Comment("[t16]=t6 ^ 
[t14]") - PXOR(t6, t1) // [t16] - Comment("store t16") - MOVOU(t1, b) + Comment("t5=b3 ^ t3") + t5, t11 := YMM(), YMM() + VMOVDQU(b.Offset(3*32), t11) + VPXOR(t3, t11, t5) - Comment("[t15]=^(r10 ^ r6)") - PXOR(buffer.Offset((22+6)*16), t8) - PANDN(f, t8) - Comment("store t15") - MOVOU(t8, b.Offset(7*16)) + Comment("t6=b4 ^ t1") + t6 := YMM() + VPXOR(b.Offset(4*32), t1, t6) - RET() -} + Comment("t7=b1 ^ t5") + VPXOR(t5, t8, t7) + Comment("t8=b1 ^ t4") + VPXOR(t4, t8, t8) -// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 -// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 -// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 -// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 -// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 -func l128() { - // l128 function - TEXT("l128", NOSPLIT, "func(x, buffer *byte)") - Doc("l128, 128 bits per 'byte'") + Comment("t9=t6 ^ t8") + t9 := YMM() + VPXOR(t8, t6, t9) + Comment("store m8") + VMOVDQU(t9, buffer.Offset((8+8)*32)) // m8 + Comment("store g1") + VMOVDQU(t7, buffer.Offset(1*32)) // g1 + Comment("store g3") + VMOVDQU(t5, buffer.Offset(3*32)) // g3 + Comment("store g4") + VMOVDQU(t2, buffer.Offset(4*32)) // g4 + Comment("store m0") + VMOVDQU(t6, buffer.Offset((8+0)*32)) // m0 + Comment("store m1") + VMOVDQU(t3, buffer.Offset((8+1)*32)) // m1 + Comment("store m2") + VMOVDQU(t8, buffer.Offset((8+2)*32)) // m2 + Comment("store m4") + VMOVDQU(t4, buffer.Offset((8+4)*32)) // m4 - b := Mem{Base: Load(Param("x"), GP64())} - buffer := Mem{Base: Load(Param("buffer"), GP64())} + Comment("t11=^(b3 ^ t1)") + VPXOR(t1, t11, t11) + VPANDN(f, t11, t11) + Comment("store m5, can reuse t1 now") + VMOVDQU(t11, buffer.Offset((8+5)*32)) // m5 - X0, X1, X2, X3, X4, X5, X6, X7, X8, X9 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + Comment("t12=^(b6 ^ t9)") + VPXOR(t9, t12, t12) + VPANDN(f, t12, t12) + Comment("store m9, can reuse t7 t8 t9 now") + VMOVDQU(t12, buffer.Offset((8+9)*32)) // m9 - MOVOU(b, X0) - MOVOU(b.Offset(8*16), X1) - MOVOU(b.Offset(16*16), X2) - MOVOU(b.Offset(24*16), X3) - MOVOU(b.Offset(18*16), X5) - MOVOU(b.Offset(22*16), X6) - MOVOU(b.Offset(26*16), X7) - MOVOU(b.Offset(30*16), X8) - MOVOU(b.Offset(2*16), X9) + Comment("t10=t6 ^ t7") + t10 := t7 + VPXOR(t6, t10, t10) + Comment("store g0, can reuse t6 now") + VMOVDQU(t10, buffer) // g0 - Comment("0=0^24^14^22^30") - MOVOU(X0, X4) - PXOR(X3, X4) - PXOR(b.Offset(14*16), X4) - PXOR(X6, X4) - PXOR(X8, X4) - MOVOU(X4, buffer) + Comment("t13=t4 ^ t10") + t13 := t10 + VPXOR(t4, t13, t13) + Comment("store g2, can reuse t4 now") + VMOVDQU(t13, buffer.Offset(2*32)) // g2 - Comment("2=0^2^26^8^16") - MOVOU(X0, X4) - PXOR(X9, X4) - PXOR(X7, X4) - PXOR(X1, X4) - PXOR(X2, X4) - MOVOU(X4, buffer.Offset(2*16)) + Comment("t14=t2 ^ t11") + t14 := t1 + VPXOR(t2, t11, t14) + Comment("store g6, can reuse t2 now") + VMOVDQU(t14, buffer.Offset(6*32)) // g6 - Comment("8=0^8^22^30^6") - MOVOU(X0, X4) - PXOR(X1, X4) - PXOR(X6, X4) - PXOR(X8, X4) - PXOR(b.Offset(6*16), X4) - MOVOU(X4, buffer.Offset(8*16)) + Comment("t15=t12^t14") + t15 := t14 + VPXOR(t12, t15, t15) + Comment("store g7") + VMOVDQU(t15, buffer.Offset(7*32)) // g7 - Comment("18=0^18^10^16^24") - MOVOU(X0, X4) - PXOR(X5, X4) - PXOR(b.Offset(10*16), X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(18*16)) + 
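The scratch buffer used by the S-box routines is addressed as fixed-width slots, 16 bytes per slot in the 128-bit code and 32 bytes per slot here: slots 0–7 hold g0–g7, slots 8–17 hold m0–m9, and slots from 22 upward are reused for the r values (a comment later in the function notes they could start at 18, since slots 18–21 are never written). The code above and below touches slots up to 26, so the caller has to provide at least 27 slots. A small helper, hypothetical and only for illustration, since the generated assembly simply hard-codes the offsets:

// slot returns slot i of the S-box scratch buffer; width is 16 for the
// sbox128 layout and 32 for sbox256avx2.
func slot(buffer []byte, i, width int) []byte {
	return buffer[i*width : (i+1)*width]
}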
Comment("t16=t3 ^ t12") + t16 := t12 + VPXOR(t3, t16, t16) + Comment("store m3") + VMOVDQU(t16, buffer.Offset((8+3)*32)) // m3 - Comment("26=0^26^18^24^8") - PXOR(X1, X0) - PXOR(X7, X0) - PXOR(X5, X0) - PXOR(X3, X0) - MOVOU(X0, buffer.Offset(26*16)) + Comment("t17=t11 ^ t16") + t17 := t16 + VPXOR(t11, t17, t17) + Comment("store m7") + VMOVDQU(t17, buffer.Offset((8+7)*32)) // m7 - Comment("10=10^2^8^16^24") - MOVOU(X9, X4) - PXOR(b.Offset(10*16), X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(10*16)) + Comment("Start top function") + Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") + // t1 = g7 + // t2 = g4 + // t3 = m1 + // t4 = m4 + // t5 = g3 + // t6 = m0 + // t7 = g2 + // t8 = m2 + // t9 = m8 + // t11 = m5 + // t12 = m7 + Comment("t2= (m0 & m1)") + VPAND(t6, t3, t3) // t2 - MOVOU(b.Offset(6*16), X0) - MOVOU(b.Offset(14*16), X5) - Comment("16=16^8^30^6^14") - PXOR(X2, X1) - PXOR(X8, X1) - PXOR(X0, X1) - PXOR(X5, X1) - MOVOU(X1, buffer.Offset(16*16)) + Comment("t3= (g0 & g4)") + VPAND(buffer, t2, t2) // t3 - Comment("24=24^16^6^14^22") - PXOR(X3, X2) - PXOR(X0, X2) - PXOR(X5, X2) - PXOR(X6, X2) - MOVOU(X2, buffer.Offset(24*16)) + Comment("t4= (g3 & g7)") + VPAND(t5, t1, t6) // [t4] := t6 - MOVOU(b.Offset(4*16), X1) - MOVOU(b.Offset(10*16), X2) - MOVOU(b.Offset(12*16), X3) - // X0=6, X1=4, X9=X4=2, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 - Comment("4=4^28^2^10^18") - MOVOU(X9, X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(b.Offset(18*16), X4) - PXOR(b.Offset(28*16), X4) - MOVOU(X4, buffer.Offset(4*16)) + Comment("t7= (g3 | g7)") + VPOR(t1, t5, t5) // [t7] := t5 - Comment("20=20^12^18^26^2") - MOVOU(X9, X4) - PXOR(b.Offset(20*16), X4) - PXOR(X3, X4) - PXOR(b.Offset(18*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(20*16)) + Comment("t11= (m4 & m5)") + VPAND(t4, t11, t11) - Comment("28=28^20^26^2^10") - PXOR(b.Offset(28*16), X9) - PXOR(b.Offset(20*16), X9) - PXOR(X7, X9) - PXOR(X2, X9) - MOVOU(X9, buffer.Offset(28*16)) + VMOVDQU(buffer.Offset((8+3)*32), t4) // t4 = m3 + Comment("t10= ( m3 & m2 )") + VPAND(t8, t4, t1) // [t10] := t1 + Comment("t12= ( m3 | m2 )") + VPOR(t8, t4, t4) //[t12] := t4 - MOVOU(b.Offset(20*16), X9) - // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + Comment("t6= ( g6 | g2 )") + VPOR(buffer.Offset(6*32), t7, t7) // [t6] := t7 - Comment("6=6^30^4^12^20") - MOVOU(X1, X4) - PXOR(X0, X4) - PXOR(X3, X4) - PXOR(X8, X4) - PXOR(X9, X4) - MOVOU(X4, buffer.Offset(6*16)) + Comment("t9= ( m6 | m7 )") + VPOR(buffer.Offset((8+6)*32), t12, t12) // [t9] := t12 + + t10 = YMM() + VMOVDQU(buffer.Offset((8+9)*32), t10) // t10 = m9 + + Comment("t5= ( m8 & m9 )") + VPAND(t9, t10, t8) // [t5] := t8 + Comment("t8= ( m8 | m9 )") + VPOR(t9, t10, t10) // [t8] := t10 + + Comment("t14 = t3 ^ t2") + VPXOR(t3, t2, t2) // t14 = t3 ^ t2 + Comment("t16 = t5 ^ t14") + VPXOR(t2, t8, t8) // t16 = t5 ^ t14, can reuse t2 now + Comment("t20 = t16 ^ t7") + VPXOR(t8, t5, t5) // t20 = t16 ^ t7 + Comment("t17 = t9 ^ t10") + VPXOR(t12, t1, t1) // t17 = t9 ^ t10 + Comment("t18 = t11 ^ t12") + VPXOR(t11, t4, t4) // t18 = t11 ^ t12 + Comment("p2 = t20 ^ t18") + VPXOR(t5, t4, t4) // p2 = t20 ^ t18, can reuse t5 now + Comment("p0 = t6 ^ t16") + VPXOR(t7, t8, t8) // p0 = t6 ^ t16 + Comment("t1 = (g5 & g1)") + VMOVDQU(buffer.Offset(1*32), t2) + VMOVDQU(buffer.Offset(5*32), t5) + VPAND(t2, t5, t5) + Comment("t13 = t1 ^ t2") + VPXOR(t5, t3, t3) // t13 = t1 ^ t2 + Comment("t15 = t13 ^ t4") + 
VPXOR(t6, t3, t3) // t15 = t4 ^ t13 + Comment("t19 = t6 ^ t15") + VPXOR(t3, t7, t7) // t19 = t6 ^ t15 + Comment("p3 = t19 ^ t17") + VPXOR(t1, t7, t7) // p3 = t19 ^ t17 + Comment("p1 = t8 ^ t15") + VPXOR(t10, t3, t3) // p1 = t8 ^ t15 - Comment("12=12^4^10^18^26") - MOVOU(X1, X4) - PXOR(X3, X4) - PXOR(X2, X4) - PXOR(b.Offset(18*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(12*16)) + Comment("start middle function") + Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") - MOVOU(b.Offset(28*16), X7) - // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=28, X8=30 - Comment("22=22^14^20^28^4") - MOVOU(X1, X4) - PXOR(X5, X4) - PXOR(X6, X4) - PXOR(X9, X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(22*16)) + // t3 = p1 + // t4 = p2 + // t7 = p3 + // t8 = p0 + Comment("t0 = (p1 & p2)") + VPAND(t3, t4, t1) + Comment("t1 = (p3 & p0)") + VPAND(t7, t8, t2) + Comment("t2 = (p0 & p2)") + VPAND(t4, t8, t5) + Comment("t3 = (p1 & p3)") + VPAND(t3, t7, t6) + Comment("t4 = (t0 & t2)") + VPAND(t1, t5, t9) + Comment("t5 = (t1 ^ t3)") + VPXOR(t2, t6, t10) + Comment("t6 = (t5 | p0)") + VPOR(t10, t8, t8) + Comment("t7 = (t2 | p3)") + VPOR(t5, t7, t7) + Comment("t8 = (t4 ^ t6)") + VPXOR(t9, t8, t8) // l3 + Comment("t9 = (t7 ^ t3)") + VPXOR(t6, t7, t9) + Comment("t10 = (t0 ^ t9)") + VPXOR(t1, t9, t1) // l0 + Comment("t11 = (t5 | p2)") + VPOR(t4, t10, t4) + Comment("l1 = t11 ^ t1") + VPXOR(t2, t4, t2) // l1 + Comment("t12 = (t2 | p1)") + VPOR(t5, t3, t3) + Comment("l2 = t12 ^ t5") + VPXOR(t3, t10, t3) // l2 - Comment("30=30^22^28^4^12") - PXOR(X8, X1) - PXOR(X6, X1) - PXOR(X3, X1) - PXOR(X7, X1) - MOVOU(X1, buffer.Offset(30*16)) + Comment("start bottom function") + Comment("current register status: t1=l0, t2=l1, t3=l2, t8=l3") + Comment("k4 = l2 ^ l3") + VPXOR(t3, t8, t7) // k4 = l2 ^ l3 + Comment("k3 = l1 ^ l3") + VPXOR(t8, t2, t6) // k3 = l1 ^ l3 + Comment("k2 = l0 ^ l2") + VPXOR(t1, t3, t5) // k2 = l0 ^ l2 + Comment("k0 = l0 ^ l1") + VPXOR(t1, t2, t4) // k0 = l0 ^ l1 + Comment("k1 = k2 ^ k3") + VPXOR(t5, t6, t9) // k1 = k2 ^ k3 + + Comment("e0= (m1 & k0)") + VMOVDQU(buffer.Offset((8+1)*32), t10) // m1 + VPAND(t4, t10, t10) // e0 + Comment("e1= (g5 & l1)") + VMOVDQU(buffer.Offset(5*32), t11) + VPAND(t2, t11, t11) // e1 + Comment("r0=e0 ^ e1") + VPXOR(t11, t10, t10) // r0 = e0 ^ e1 + Comment("e2=(g4 & l0)") + VMOVDQU(buffer.Offset(4*32), t12) + VPAND(t1, t12, t12) // e2 + Comment("r1=e2 ^ e1") + VPXOR(t12, t11, t11) // r1 = e2 ^ e1 + Comment("store r0 r1") + VMOVDQU(t10, buffer.Offset(22*32)) // in fact, we can start from 18*32 + VMOVDQU(t11, buffer.Offset(23*32)) + + Comment("e3= (m7 & k3)") + VMOVDQU(buffer.Offset((8+7)*32), t10) // m7 + VPAND(t6, t10, t10) // e3 + Comment("e4= (m5 & k2)") + VMOVDQU(buffer.Offset((8+5)*32), t11) // m5 + VPAND(t5, t11, t11) + Comment("r2=e3 ^ e4") + VPXOR(t11, t10, t10) // r2 = e3 ^ e4 + Comment("e5= (m3 & k1)") + VMOVDQU(buffer.Offset((8+3)*32), t12) // m3 + VPAND(t9, t12, t12) + Comment("r3=e5 ^ e4") + VPXOR(t12, t11, t11) // r3 = e5 ^ e4 + Comment("store r2 r3") + VMOVDQU(t10, buffer.Offset(24*32)) + VMOVDQU(t11, buffer.Offset(25*32)) + + Comment("e6=(m9 & k4)") + VMOVDQU(buffer.Offset((8+9)*32), t10) // m9 + VPAND(t7, t10, t10) + Comment("e7=(g7 & l3)") + VMOVDQU(buffer.Offset(7*32), t11) + VPAND(t8, t11, t11) + Comment("r4=e7 ^ e6") + VPXOR(t11, t10, t10) // r4 = e6 ^ e7 + Comment("e8=(g6 & l2)") + VMOVDQU(buffer.Offset(6*32), t12) + VPAND(t3, t12, t12) // e8 + Comment("r5=e8 ^ e6") + VPXOR(t12, t11, t12) // r5 = e8 ^ e7 = t12 + Comment("store r4") + VMOVDQU(t10, 
buffer.Offset(26*32)) + + Comment("e9=(m0 & k0)") + VMOVDQU(buffer.Offset((8+0)*32), t10) // m0 + VPAND(t4, t10, t10) + Comment("e10=(g1 & l1)") + VMOVDQU(buffer.Offset(1*32), t4) + VPAND(t4, t2, t2) // e10 + Comment("r6=e9 ^ e10") + VPXOR(t10, t2, t10) // r6 = e9 ^ e10 = t10 + Comment("e11=(g0 & l0)") + VMOVDQU(buffer, t11) + VPAND(t1, t11, t11) + Comment("r7=e11 ^ e10") + VPXOR(t11, t2, t1) // r7 = e11 ^ e10 = t1 + + Comment("e12=(m6 & k3)") + VMOVDQU(buffer.Offset((8+6)*32), t2) // m6 + VPAND(t2, t6, t2) // e12 + Comment("e13=(m4 & k2)") + VMOVDQU(buffer.Offset((8+4)*32), t6) // m4 + VPAND(t6, t5, t5) // e13 + Comment("r8=e12 ^ e13") + VPXOR(t2, t5, t2) // r8 = e12 ^ e13 = t2 + Comment("e14=(m2 & k1)") + VMOVDQU(buffer.Offset((8+2)*32), t6) // m2 + VPAND(t6, t9, t6) + Comment("r9=e14 ^ e13") + VPXOR(t5, t6, t4) // r9 = e14 ^ e13 = t4 + + Comment("e15=(m8 & k4)") + VMOVDQU(buffer.Offset((8+8)*32), t9) // m8 + VPAND(t9, t7, t9) // e15 + Comment("e16=(g3 & l3)") + VMOVDQU(buffer.Offset(3*32), t7) + VPAND(t7, t8, t8) // e16 + Comment("r10=e15 ^ e16") + VPXOR(t9, t8, t5) // r10 = e15 ^ e16 = t5 + Comment("e17=(g2 & l2)") + VMOVDQU(buffer.Offset(2*32), t9) + VPAND(t3, t9, t3) // e17 + Comment("r11=e17 ^ e16") + VPXOR(t3, t8, t3) // r11 = e17 ^ e16 = t3 - Comment("14=14^6^12^20^28") - PXOR(X3, X0) - PXOR(X7, X0) - PXOR(X9, X0) - PXOR(X5, X0) - MOVOU(X0, buffer.Offset(14*16)) + Comment("start output function") + // t12 = r5 + // t10 = r6 + // t1 = r7 + // t2 = r8 + // t4 = r9 + // t5 = r10 + // t3 = r11 + Comment("[t1]=r7 ^ r9") + VPXOR(t1, t4, t4) // [t1] = t4 = r7 ^ r9 - MOVOU(b.Offset(1*16), X0) - MOVOU(b.Offset(9*16), X1) - MOVOU(b.Offset(17*16), X2) - MOVOU(b.Offset(25*16), X3) - MOVOU(b.Offset(19*16), X5) - MOVOU(b.Offset(23*16), X6) - MOVOU(b.Offset(27*16), X7) - MOVOU(b.Offset(31*16), X8) - MOVOU(b.Offset(3*16), X9) + Comment("[t2]=t1 ^ r1") + VMOVDQU(buffer.Offset((22+1)*32), t6) // r1 + VPXOR(t4, t6, t6) // [t2] = t6 = r1 ^ [t1] - Comment("1=1^25^15^23^31") - MOVOU(X0, X4) - PXOR(X3, X4) - PXOR(b.Offset(15*16), X4) - PXOR(X6, X4) - PXOR(X8, X4) - MOVOU(X4, buffer.Offset(1*16)) + Comment("[t3]=t2 ^ r3") + VMOVDQU(buffer.Offset((22+3)*32), t8) // r3 + VPXOR(t6, t8, t7) // [t3] = t7 = r3 ^ [t2] + Comment("[t4]=r5 ^ r3") + VPXOR(t12, t8, t8) // [t4] = t8 = r5 ^ r3 - Comment("3=3^27^1^9^17") - MOVOU(X0, X4) - PXOR(X9, X4) - PXOR(X7, X4) - PXOR(X1, X4) - PXOR(X2, X4) - MOVOU(X4, buffer.Offset(3*16)) + Comment("[t5]=r4 ^ [t4]") + VMOVDQU(buffer.Offset((22+4)*32), t11) // r4 + VPXOR(t8, t11, t9) // [t5] = t9 = r4 ^ t4 + Comment("[t6]=r0 ^ r4") + VPXOR(buffer.Offset(22*32), t11, t11) // [t6] = t11 = r4 ^ r0 - Comment("9=9^1^23^31^7") - MOVOU(X0, X4) - PXOR(X1, X4) - PXOR(X6, X4) - PXOR(X8, X4) - PXOR(b.Offset(7*16), X4) - MOVOU(X4, buffer.Offset(9*16)) + Comment("[t7]=r11 ^ r7") + VPXOR(t3, t1, t1) // [t7] t1 = r7 ^ r11 - Comment("19=1^19^11^17^25") - MOVOU(X0, X4) - PXOR(X5, X4) - PXOR(b.Offset(11*16), X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(19*16)) + Comment("[t8]=[t1] ^ [t4]") + VPXOR(t4, t8, t8) // t8 = t4 ^ t11 + Comment("store t8") + VMOVDQU(t8, b.Offset(5*32)) - Comment("27=1^27^19^25^9") - PXOR(X1, X0) - PXOR(X7, X0) - PXOR(X5, X0) - PXOR(X3, X0) - MOVOU(X0, buffer.Offset(27*16)) + Comment("[t9]=[t1] ^ [t6]") + VPXOR(t11, t4, t4) // [t9] = t4 + Comment("store t9") + VMOVDQU(t4, b.Offset(2*32)) - Comment("11=11^3^9^17^25") - MOVOU(X9, X4) - PXOR(b.Offset(11*16), X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(11*16)) + Comment("[t10]=r2 ^ 
t5") + VPXOR(buffer.Offset((22+2)*32), t9, t9) // [t10] t9 = r2 ^ [t5] + Comment("[t11]=r10 ^ r8") + VPXOR(t5, t2, t2) // [t11] = t2 + Comment("store t11") + VMOVDQU(t2, b.Offset(3*32)) + Comment("[t12]=^([t3] ^ [t11])") + VPXOR(t7, t2, t2) + VPANDN(f, t2, t2) // [t12] = t2 + Comment("store t12") + VMOVDQU(t2, b.Offset(1*32)) + Comment("[t13]=[t10] ^ [t12]") + VPXOR(t2, t9, t9) // [t13] = t9 + Comment("store t13") + VMOVDQU(t9, b.Offset(6*32)) - MOVOU(b.Offset(7*16), X0) - MOVOU(b.Offset(15*16), X5) - Comment("17=17^9^31^7^15") - PXOR(X2, X1) - PXOR(X8, X1) - PXOR(X0, X1) - PXOR(X5, X1) - MOVOU(X1, buffer.Offset(17*16)) + Comment("[t14]=^([t3] ^ [t7])") + VPXOR(t7, t1, t1) + VPANDN(f, t1, t1) // [t14] + Comment("store t14") + VMOVDQU(t1, b.Offset(4*32)) + Comment("[t16]=[t6] ^ [t14]") + VPXOR(t11, t1, t1) // [t16] + Comment("store t16") + VMOVDQU(t1, b) - Comment("25=25^17^7^15^23") - PXOR(X3, X2) - PXOR(X0, X2) - PXOR(X5, X2) - PXOR(X6, X2) - MOVOU(X2, buffer.Offset(25*16)) + Comment("[t15]=^(r10 ^ r6)") + VPXOR(t10, t5, t5) + VPANDN(f, t5, t5) + Comment("store t15") + VMOVDQU(t5, b.Offset(7*32)) - MOVOU(b.Offset(5*16), X1) - MOVOU(b.Offset(11*16), X2) - MOVOU(b.Offset(13*16), X3) - // X0=7, X1=5, X9=X4=3, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 - Comment("5=5^29^3^11^19") - MOVOU(X9, X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(b.Offset(19*16), X4) - PXOR(b.Offset(29*16), X4) - MOVOU(X4, buffer.Offset(5*16)) + VZEROUPPER() + RET() +} - Comment("21=21^13^19^27^3") - MOVOU(X9, X4) - PXOR(b.Offset(21*16), X4) - PXOR(X3, X4) - PXOR(b.Offset(19*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(21*16)) +func xorRoundKey256avx2() { + // xorRoundKey256avx2 function + TEXT("xorRoundKey256avx2", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") + Doc("xor x1, x2, x3 with round key, 32 bytes per bit") - Comment("29=29^21^27^3^11") - PXOR(b.Offset(29*16), X9) - PXOR(b.Offset(21*16), X9) - PXOR(X7, X9) - PXOR(X2, X9) - MOVOU(X9, buffer.Offset(29*16)) + x := Load(Param("rk"), GP32()) + x1 := Mem{Base: Load(Param("x1"), GP64())} + x2 := Mem{Base: Load(Param("x2"), GP64())} + x3 := Mem{Base: Load(Param("x3"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} - MOVOU(b.Offset(21*16), X9) - // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 + ret := YMM() + one := YMM() + VPCMPEQB(one, one, one) - Comment("7=7^31^5^13^21") - MOVOU(X1, X4) - PXOR(X0, X4) - PXOR(X3, X4) - PXOR(X8, X4) - PXOR(X9, X4) - MOVOU(X4, buffer.Offset(7*16)) + y := GP32() - Comment("13=13^5^11^19^27") - MOVOU(X1, X4) - PXOR(X3, X4) - PXOR(X2, X4) - PXOR(b.Offset(19*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(13*16)) + count := GP64() + XORQ(count, count) + Comment("Handle first byte") + MOVL(U32(0x01000000), y) + Label("rk_loop_1") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_1_c")) + VPXOR(one, ret, ret) + Label("rk_loop_1_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(256)) + JL(LabelRef("rk_loop_1")) - MOVOU(b.Offset(29*16), X7) - // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=29, X8=31 - Comment("23=23^15^21^29^5") - MOVOU(X1, X4) - PXOR(X5, X4) - PXOR(X6, X4) - PXOR(X9, X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(23*16)) + Comment("Handle second byte") + MOVL(U32(0x00010000), y) + Label("rk_loop_2") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_2_c")) + VPXOR(one, 
ret, ret) + Label("rk_loop_2_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(512)) + JL(LabelRef("rk_loop_2")) - Comment("31=31^23^29^5^13") - PXOR(X8, X1) - PXOR(X6, X1) - PXOR(X3, X1) - PXOR(X7, X1) - MOVOU(X1, buffer.Offset(31*16)) + Comment("Handle third byte") + MOVL(U32(0x00000100), y) + Label("rk_loop_3") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_3_c")) + VPXOR(one, ret, ret) + Label("rk_loop_3_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(768)) + JL(LabelRef("rk_loop_3")) - Comment("15=15^7^13^21^29") - PXOR(X3, X0) - PXOR(X7, X0) - PXOR(X9, X0) - PXOR(X5, X0) - MOVOU(X0, buffer.Offset(15*16)) + Comment("Handle last byte") + MOVL(U32(0x00000001), y) + Label("rk_loop_4") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_4_c")) + VPXOR(one, ret, ret) + Label("rk_loop_4_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(1024)) + JL(LabelRef("rk_loop_4")) RET() } @@ -2019,11 +3562,17 @@ func main() { transpose128Rev() transpose128avx(flipMask) transpose128RevAvx(flipMask) + transpose256avx(flipMask) + transpose128x256avx2(flipMask) + transpose256RevAvx(flipMask) xor32x128() xor32x128avx() xorRoundKey128() sbox128() l128() + l256() + sbox256avx2() + xorRoundKey256avx2() Generate() } diff --git a/bs128.go b/bs128.go index f2cc2a6..127da66 100644 --- a/bs128.go +++ b/bs128.go @@ -20,7 +20,7 @@ func (bs bs128) tao(x, buffer []byte) []byte { } func (bs bs128) xor32(x1, x2 []byte) []byte { - xor32x128avx(&x1[0], &x2[0], &x1[0]) + xor32x128avx(32*bs.bytes(), &x1[0], &x2[0], &x1[0]) return x1 } diff --git a/bs128_test.go b/bs128_test.go index d978198..e6a1604 100644 --- a/bs128_test.go +++ b/bs128_test.go @@ -94,20 +94,16 @@ func BenchmarkTao128(b *testing.B) { } func TestL128(t *testing.T) { - x := make([]byte, 32*BS128.bytes()) buffer := make([]byte, 32*BS128.bytes()) - b := 0x00010203 ^ bits.RotateLeft32(0x00010203, 2) ^ bits.RotateLeft32(0x00010203, 10) ^ bits.RotateLeft32(0x00010203, 18) ^ bits.RotateLeft32(0x00010203, 24) + b := uint32(0xe0e7eef5) ^ bits.RotateLeft32(0xe0e7eef5, 2) ^ bits.RotateLeft32(0xe0e7eef5, 10) ^ bits.RotateLeft32(0xe0e7eef5, 18) ^ bits.RotateLeft32(0xe0e7eef5, 24) expected := newUint32x128(b) - copy(x, newByte128(byte(0))) - copy(x[8*BS128.bytes():], newByte128(byte(1))) - copy(x[16*BS128.bytes():], newByte128(byte(2))) - copy(x[24*BS128.bytes():], newByte128(byte(3))) + x := newUint32x128(0xe0e7eef5) ret := BS128.l(x, buffer) if !bytes.Equal(ret, expected) { - t.Fatalf("unexpected l128 result, expected %x, got %x", expected, ret) + t.Fatalf("unexpected l256 result, expected %x, got %x", expected, ret) } } diff --git a/bs_amd64.go b/bs_amd64.go index 5d8a25e..ed77520 100644 --- a/bs_amd64.go +++ b/bs_amd64.go @@ -22,11 +22,20 @@ func transpose128avx(in *byte, out *byte) // Bit level matrix transpose, b0-b1-b2-b3, 128x128 func transpose128RevAvx(in *byte, out *byte) +// Bit level matrix transpose, 256x128 => 128x256 +func transpose256avx(in *byte, out *byte) + +// Bit level matrix transpose, 128x256 => 256x128, just for test here. 
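xorRoundKey256avx2 above walks the 32 round-key bits one byte at a time: the mask starts at 0x01000000, 0x00010000, 0x00000100 and 0x00000001 for the four groups of eight slices and is rotated left after each slice, and for every bit the routine XORs the three input slices and complements the result (XOR with the all-ones register) when that key bit is set. A scalar model of the same loop structure over 32-byte slices, offered as a sketch rather than the generated code:

// xorRoundKeyRef mirrors xorRoundKey256avx2: slice i of out is x1^x2^x3,
// complemented when the round-key bit selected by the rotating mask is set.
func xorRoundKeyRef(rk uint32, x1, x2, x3, out []byte) {
	masks := [4]uint32{0x01000000, 0x00010000, 0x00000100, 0x00000001}
	const w = 32 // bytes per bit slice
	for g, m := range masks {
		for j := 0; j < 8; j++ {
			off := (g*8 + j) * w
			for k := 0; k < w; k++ {
				v := x1[off+k] ^ x2[off+k] ^ x3[off+k]
				if rk&m != 0 {
					v = ^v // key bit set: flip the whole slice
				}
				out[off+k] = v
			}
			m = m<<1 | m>>31 // ROLL $1
		}
	}
}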
+func transpose128x256avx2(in *byte, out *byte) + +// Bit level matrix transpose, b0-b1-b2-b3, 128x256 +func transpose256RevAvx(in *byte, out *byte) + // out = x xor y func xor32x128(x *byte, y *byte, out *byte) // out = x xor y -func xor32x128avx(x *byte, y *byte, out *byte) +func xor32x128avx(len int, x *byte, y *byte, out *byte) // xor x1, x2, x3 with round key, 16 bytes per bit func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) @@ -36,3 +45,12 @@ func sbox128(x *byte, buffer *byte) // l128, 128 bits per 'byte' func l128(x *byte, buffer *byte) + +// l256, 256 bits per 'byte' +func l256(x *byte, buffer *byte) + +// sbox256avx2, 256 bits per 'byte' +func sbox256avx2(x *byte, buffer *byte) + +// xor x1, x2, x3 with round key, 32 bytes per bit +func xorRoundKey256avx2(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) diff --git a/bs_amd64.s b/bs_amd64.s index 1850558..556e139 100644 --- a/bs_amd64.s +++ b/bs_amd64.s @@ -3106,946 +3106,4023 @@ col_loop_b0: VZEROUPPER RET +// func transpose256avx(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose256avx(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row + XORQ SI, SI + +row_loop: + // Initialize cc, current col + XORQ BX, BX + +col_loop: + // Initialize (rr * ncols + cc) / 8, here ncols=128 + MOVQ SI, DI + + // Multiple with ncols + SHLQ $0x07, DI + ADDQ BX, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD 
$0x01, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + 
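Each eight-step block in the transpose output path serializes one YMM register into eight 32-bit rows: VPMOVMSKB collects the top bit of each of the 32 bytes, the mask is stored, and VPSLLQ $1 promotes the next bit to the top for the following pass. Shifting 64-bit lanes instead of individual bytes is safe here because a carry from a lower byte cannot reach a byte's top bit within seven shifts. A scalar model of one such block, for illustration only:

// movmskRows models one VPMOVMSKB/VPSLLQ block: pass p returns bit (7-p) of
// every byte, packed with byte i at result bit i, exactly as VPMOVMSKB does.
func movmskRows(v [32]byte) (rows [8]uint32) {
	for p := 0; p < 8; p++ {
		var m uint32
		for i, b := range v {
			m |= uint32(b>>7) << uint(i)
		}
		rows[p] = m
		for i := range v {
			v[i] <<= 1 // stands in for VPSLLQ $1; top bits agree for these 8 passes
		}
	}
	return rows
}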
MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + + // Compare cc with ncols, here ncols=128 + CMPQ BX, $0x80 + JL col_loop + ADDQ $0x20, SI + + // Compare rr with nrows, here nrows=256 + CMPQ SI, $0x00000100 + JL row_loop + VZEROUPPER + RET + +// func transpose128x256avx2(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose128x256avx2(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row + XORQ SI, SI + +row_loop: + // Initialize cc, current col + XORQ BX, BX + +col_loop: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ SI, DI + + // Multiple with ncols + SHLQ $0x08, DI + ADDQ BX, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL 
(AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + 
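transpose256avx (256x128 => 128x256) and transpose128x256avx2 (128x256 => 256x128) are inverse reshapes of the same bit matrix, so a plain byte-wise oracle is enough to check either direction. The sketch below assumes MSB-first bit numbering within each byte, which is what the flip mask and the VPMOVMSKB handling suggest; treat the exact bit convention as an assumption rather than something the patch states.

// transposeRef computes a bit-level transpose of a row-major bit matrix:
// bit (r, c) of in becomes bit (c, r) of out. nrows and ncols are bit counts,
// both multiples of 8; bits are taken MSB-first within each byte (assumed).
func transposeRef(in, out []byte, nrows, ncols int) {
	for i := range out {
		out[i] = 0
	}
	for r := 0; r < nrows; r++ {
		for c := 0; c < ncols; c++ {
			if in[(r*ncols+c)/8]>>(7-uint(c)%8)&1 != 0 {
				out[(c*nrows+r)/8] |= 1 << (7 - uint(r)%8)
			}
		}
	}
}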
VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + + // Compare cc with ncols, here ncols=256 + CMPQ BX, $0x00000100 + JL col_loop + ADDQ $0x20, SI + + // Compare rr with nrows, here nrows=128 + CMPQ SI, $0x00000080 + JL row_loop + VZEROUPPER + RET + +// func transpose256RevAvx(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose256RevAvx(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row, 96 + XORQ BX, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b3: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00006000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + 
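Each 4-byte store above is one VPMOVMSKB (collect the most significant bit of all 32 bytes in the YMM) followed by VPSLLQ $1 (shift every 64-bit lane left so the next bit becomes the MSB). A behavioural model in Go — msbColumns is an illustrative name; a per-byte shift stands in for the per-qword VPSLLQ, which gives the same extracted bits because only eight bits are ever read from each byte before the vector is reloaded:

// msbColumns: rows holds one byte from each of 32 input rows; the i-th
// result word holds bit i (counting from the MSB) of every row, i.e. one
// 32-bit column of the transposed output.
func msbColumns(rows [32]byte) (cols [8]uint32) {
	for i := 0; i < 8; i++ {
		var m uint32
		for j := 0; j < 32; j++ {
			m |= uint32(rows[j]>>7) << uint(j) // VPMOVMSKB: top bit of each byte
			rows[j] <<= 1                      // VPSLLQ $1: expose the next bit
		}
		cols[i] = m
	}
	return cols
}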
PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the 
most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b3 + ADDQ $0x20, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b2: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00004000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ 
$0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL 
DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b2 + ADDQ $0x20, BX + + // Initialize cc, current 
col + XORQ SI, SI + +col_loop_b1: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00002000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB 
Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, 
(CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b1 + ADDQ $0x20, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b0: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX 
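The four column blocks of transpose256RevAvx (col_loop_b3 down to col_loop_b0) read input rows 96..127, 64..95, 32..63 and 0..31 respectively, and their stores add 0, 4, 8 and 12 to the output offset, so the block taken from the highest input rows lands in the lowest-addressed 32-bit word of each 16-byte output row. A small helper sketching that placement (illustrative only, not part of the package):

// revWordOffset returns the byte offset of block bN's word within the
// output for a given output row (an input column): 16 bytes per output
// row (nrows = 128), 4 bytes per block word, words in reversed order.
func revWordOffset(block, outCol int) int {
	return outCol*16 + (3-block)*4
}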
+ MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, 
(CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b0 + VZEROUPPER + RET + // func xor32x128(x *byte, y *byte, out *byte) // Requires: SSE2 -TEXT ·xor32x128(SB), NOSPLIT, $0-24 - MOVQ x+0(FP), AX - MOVQ y+8(FP), CX - MOVQ out+16(FP), DX - XORQ BX, BX +TEXT ·xor32x128(SB), NOSPLIT, $0-24 + MOVQ x+0(FP), AX + MOVQ y+8(FP), CX + MOVQ out+16(FP), DX + XORQ BX, BX + +xor32_loop: + MOVOU (AX)(BX*1), X0 + MOVOU (CX)(BX*1), X1 + PXOR X0, X1 + MOVOU X1, (DX)(BX*1) + ADDQ $0x10, BX + CMPQ BX, $0x00000200 + JL xor32_loop + RET + +// func xor32x128avx(len int, x *byte, y *byte, out *byte) +// Requires: AVX, AVX2 +TEXT ·xor32x128avx(SB), NOSPLIT, $0-32 + MOVQ x+8(FP), AX + MOVQ y+16(FP), CX + MOVQ out+24(FP), DX + MOVQ len+0(FP), BX + XORQ SI, SI + +xor32_loop_avx: + VMOVDQU (AX)(SI*1), Y0 + VMOVDQU (CX)(SI*1), Y1 + VPXOR Y0, Y1, Y1 + VMOVDQU Y1, (DX)(SI*1) + ADDQ $0x20, SI + CMPQ SI, BX + JL xor32_loop_avx + VZEROUPPER + RET + +// func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) +// Requires: SSE2 +TEXT ·xorRoundKey128(SB), NOSPLIT, $0-40 + MOVL rk+0(FP), AX + MOVQ x1+8(FP), CX + MOVQ x2+16(FP), DX + MOVQ x3+24(FP), BX + MOVQ out+32(FP), SI + PCMPEQB X1, X1 + XORQ R8, R8 + + // Handle first byte + MOVL $0x01000000, DI + +rk_loop_1: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_1_c + PXOR X1, X0 + +rk_loop_1_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000080 + JL rk_loop_1 + + // Handle second byte + MOVL $0x00010000, DI + +rk_loop_2: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_2_c + PXOR X1, X0 + +rk_loop_2_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000100 + JL rk_loop_2 + + // Handle third byte + MOVL $0x00000100, DI + +rk_loop_3: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_3_c + PXOR X1, X0 + +rk_loop_3_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000180 + JL rk_loop_3 + + // Handle last byte + MOVL $0x00000001, DI + +rk_loop_4: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_4_c + PXOR X1, X0 + +rk_loop_4_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000200 + JL rk_loop_4 + RET + +// func sbox128(x *byte, buffer *byte) +// Requires: SSE2 +TEXT ·sbox128(SB), NOSPLIT, $0-16 + MOVQ x+0(FP), AX + MOVQ buffer+8(FP), CX + + // f, for not operation + PCMPEQB X0, X0 + + // Start input function + // t1=b7 ^ b5 + MOVOU 112(AX), X1 + PXOR 80(AX), X1 + MOVOU 16(AX), X2 + MOVOU X2, X3 + MOVOU X2, X4 + + // store m6=b1 + MOVOU X2, 224(CX) + + // t2=b5 ^ b1 + PXOR 80(AX), X2 + PANDN X0, X2 + + // store g5=^b0 + MOVOU (AX), X5 + MOVOU X5, X6 + PANDN X0, X6 + MOVOU X6, 80(CX) + + // t3=^(b0 ^ t2) + PXOR X2, X5 + PANDN X0, X5 + + // t4=b6 ^ b2 + MOVOU 96(AX), X6 + MOVOU X6, X7 + PXOR 32(AX), X6 + + // t5=b3 ^ t3 + MOVOU 48(AX), X8 + MOVOU X8, X9 + PXOR X5, 
X8 + + // t6=b4 ^ t1 + MOVOU 64(AX), X10 + PXOR X1, X10 + + // t7=b1 ^ t5 + PXOR X8, X3 + + // t8=b1 ^ t4 + PXOR X6, X4 + + // t9=t6 ^ t8 + MOVOU X10, X11 + PXOR X4, X11 + + // store m8 + MOVOU X11, 256(CX) + + // store g1 + MOVOU X3, 16(CX) + + // store g3 + MOVOU X8, 48(CX) + + // store g4 + MOVOU X2, 64(CX) + + // store m0 + MOVOU X10, 128(CX) + + // store m1 + MOVOU X5, 144(CX) + + // store m2 + MOVOU X4, 160(CX) + + // store m4 + MOVOU X6, 192(CX) + + // t11=^(b3 ^ t1) + PXOR X1, X9 + PANDN X0, X9 + + // store m5, can reuse t1 now + MOVOU X9, 208(CX) + + // t12=^(b6 ^ t9) + PXOR X11, X7 + PANDN X0, X7 + + // store m9, can reuse t7 t8 t9 now + MOVOU X7, 272(CX) + + // t10=t6 ^ t7 + PXOR X10, X3 + + // store g0, can reuse t6 now + MOVOU X3, (CX) + + // t13=t4 ^ t10 + PXOR X6, X3 + + // store g2, can reuse t4 now + MOVOU X3, 32(CX) + + // t14=t2 ^ t11 + MOVOU X9, X1 + PXOR X2, X1 + + // store g6, can reuse t2 now + MOVOU X1, 96(CX) + + // t15=t12^t14 + PXOR X7, X1 + + // store g7 + MOVOU X1, 112(CX) + + // t16=t3 ^ t12 + PXOR X5, X7 + + // store m3 + MOVOU X7, 176(CX) + + // t17=t11 ^ t16 + PXOR X9, X7 + + // store m7 + MOVOU X7, 240(CX) + + // Start top function + // Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8 + // t2=m0 & m1 + PAND X10, X5 + + // t3=g0 & g4 + PAND (CX), X2 + + // t4=g3 & g7 + MOVOU X1, X10 + PAND X8, X1 + + // t7=g3 | g7 + POR X10, X8 + + // t11=m4 & m5 + PAND X6, X9 + MOVOU 176(CX), X6 + MOVOU X6, X10 + + // t10=m3 & m2 + PAND X4, X10 + + // t12=m3 | m2 + POR X4, X6 + + // t6=g6 | g2 + POR 96(CX), X3 + + // t9=m6 | m7 + POR 224(CX), X7 + MOVOU 272(CX), X4 + MOVOU X4, X12 + + // t5=m8 & m9 + PAND X11, X4 + + // t8=m8 | m9 + POR X11, X12 + + // t14 = t3 ^ t2 + PXOR X5, X2 + + // t16 = t5 ^ t14 + PXOR X2, X4 + + // t20 = t16 ^ t7 + PXOR X4, X8 + + // t17 = t9 ^ t10 + PXOR X7, X10 + + // t18 = t11 ^ t12 + PXOR X9, X6 + + // p2 = t20 ^ t18 + PXOR X8, X6 + + // p0 = t6 ^ t16 + PXOR X3, X4 + + // t1 = g5 & g1 + MOVOU 16(CX), X2 + MOVOU 80(CX), X8 + PAND X2, X8 + + // t13 = t1 ^ t2 + PXOR X8, X5 + + // t15 = t13 ^ t4 + PXOR X1, X5 + + // t19 = t6 ^ t15 + PXOR X5, X3 + + // p3 = t19 ^ t17 + PXOR X10, X3 + + // p1 = t8 ^ t15 + PXOR X12, X5 + + // start middle function + // current register status: t8=p0, t3=p1, t4=p2, t7=p0 + // t0 = p1 & p2 + MOVOU X5, X1 + PAND X6, X1 + + // t1 = p3 & p0 + MOVOU X4, X2 + PAND X3, X2 + + // t2 = p0 & p2 + MOVOU X6, X8 + PAND X4, X8 + + // t3 = p1 & p3 + MOVOU X5, X10 + PAND X3, X10 + + // t4 = t0 & t2 + MOVOU X1, X11 + PAND X8, X11 + + // t5 = t1 & t3 + MOVOU X2, X12 + PXOR X10, X12 + + // t6 = t5 | p0 + POR X12, X4 + + // t7 = t2 | p3 + POR X8, X3 + + // t8 = t4 ^ t6 + PXOR X11, X4 + + // t9 = t7 ^ t3 + PXOR X3, X10 + + // t10 = t0 ^ t9 + PXOR X1, X10 + + // t11 = p2 | t5 + POR X12, X6 + + // l1 = t11 ^ t1 + PXOR X6, X2 + + // t12 = p1 | t2 + POR X8, X5 + + // l2 = t12 ^ t5 + PXOR X12, X5 + + // start bottom function + // current register status: t6=l0, t2=l1, t3=l2, t8=l3 + // k4 = l2 ^ l3 + MOVOU X4, X8 + PXOR X5, X8 + + // k3 = l1 ^ l3 + MOVOU X4, X6 + PXOR X2, X6 + + // k2 = l0 ^ l2 + MOVOU X10, X3 + PXOR X5, X3 + + // k0 = l0 ^ l1 + MOVOU X10, X1 + PXOR X2, X1 + + // k1 = k2 ^ k3 + MOVOU X6, X11 + PXOR X3, X11 + + // e0=(m1 & k0) + MOVOU 144(CX), X12 + PAND X1, X12 + + // e1=(g5 & l1) + MOVOU 80(CX), X9 + PAND X2, X9 + + // r0=e0 ^ e1 + PXOR X9, X12 + + // e2=(g4 & l0) + MOVOU 64(CX), X7 + PAND X10, X7 + + // r1=e2 ^ e1 + PXOR X7, X9 + + // store r0 r1 + 
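sbox128 evaluates the S-box as a Boolean circuit over eight bit planes: plane b_i (16 bytes at offset 16*i) holds one bit position of 128 independent inputs, so each PXOR/PAND/POR is a single gate applied to all 128 inputs at once, and PANDN against the all-ones X0 register is the bitwise NOT. A minimal sketch of the first gates of the input function over uint64 planes (64-way instead of 128-way; sboxInputHead is an illustrative name, not part of the package):

// b[i] is bit plane i of 64 parallel S-box inputs.
func sboxInputHead(b *[8]uint64) (t1, t2, t3 uint64) {
	t1 = b[7] ^ b[5]    // t1 = b7 ^ b5
	t2 = ^(b[5] ^ b[1]) // t2: PXOR, then PANDN with the all-ones register
	t3 = ^(b[0] ^ t2)   // t3 = ^(b0 ^ t2)
	return t1, t2, t3
}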
MOVOU X12, 352(CX) + MOVOU X9, 368(CX) + + // e3=(m7 & k3) + MOVOU 240(CX), X12 + PAND X6, X12 + + // e4=(m5 & k2) + MOVOU 208(CX), X9 + PAND X3, X9 + + // r2=e3 ^ e4 + PXOR X9, X12 + + // e5=(m3 & k1) + MOVOU 176(CX), X7 + PAND X11, X7 + + // r3=e5 ^ e4 + PXOR X7, X9 + + // store r2 r3 + MOVOU X12, 384(CX) + MOVOU X9, 400(CX) + + // e6=(m9 & k4) + MOVOU 272(CX), X12 + PAND X8, X12 + + // e7=(g7 & l3) + MOVOU 112(CX), X9 + PAND X4, X9 + + // r4=e7 ^ e6 + PXOR X9, X12 + + // e8=(g6 & l2) + MOVOU 96(CX), X7 + PAND X5, X7 + + // r5=e8 ^ e6 + PXOR X9, X7 + + // store r4 + MOVOU X12, 416(CX) + + // e9=(m0 & k0) + MOVOU 128(CX), X12 + PAND X1, X12 + + // e10=(g1 & l1) + MOVOU 16(CX), X1 + PAND X2, X1 + + // r6=e9 ^ e10 + PXOR X1, X12 + + // e11=(g0 & l0) + MOVOU (CX), X9 + PAND X9, X10 + + // r7=e11 ^ e10 + PXOR X10, X1 + + // e12=(m6 & k3) + MOVOU 224(CX), X2 + PAND X6, X2 + + // e13=(m4 & k2) + MOVOU 192(CX), X10 + PAND X3, X10 + + // r8=e12 ^ e13 + PXOR X10, X2 + + // e14=(m2 & k1) + MOVOU 160(CX), X6 + PAND X11, X6 + + // r9=e14 ^ e13 + PXOR X10, X6 + + // e15=(m8 & k4) + MOVOU 256(CX), X11 + PAND X11, X8 + + // e16=(g3 & l3) + MOVOU 48(CX), X11 + PAND X11, X4 + + // r10=e15 ^ e16 + PXOR X4, X8 + + // e17=(g2 & l2) + MOVOU 32(CX), X9 + PAND X9, X5 + + // r11=e17 ^ e16 + PXOR X4, X5 + + // start output function + // [t1]=r7 ^ r9 + PXOR X1, X6 + + // [t2]=t1 ^ r1 + MOVOU 368(CX), X10 + PXOR X6, X10 + + // [t3]=t2 ^ r3 + MOVOU 400(CX), X3 + MOVOU X3, X4 + PXOR X10, X3 + + // [t4]=r5 ^ r3 + PXOR X7, X4 + + // [t5]=r4 ^ [t4] + MOVOU 416(CX), X11 + MOVOU X11, X9 + PXOR X4, X11 + + // [t6]=r0 ^ r4 + PXOR 352(CX), X9 + + // [t7]=r11 ^ r7 + PXOR X5, X1 + + // [t8]=[t1] ^ [t4] + PXOR X6, X4 + + // store t8 + MOVOU X4, 80(AX) + + // [t9]=[t1] ^ [t6] + PXOR X9, X6 + + // store t9 + MOVOU X6, 32(AX) + + // [t10]=r2 ^ t5 + PXOR 384(CX), X11 + + // [t11]=r10 ^ r8 + PXOR X8, X2 + + // store t11 + MOVOU X2, 48(AX) + + // [t12]=^([t3] ^ [t11]) + PXOR X3, X2 + PANDN X0, X2 + + // store t12 + MOVOU X2, 16(AX) + + // [t13]=[t10] ^ [t12] + PXOR X2, X11 + + // store t13 + MOVOU X11, 96(AX) + + // [t14]=^([t3] ^ [t7]) + PXOR X3, X1 + PANDN X0, X1 + + // store t14 + MOVOU X1, 64(AX) + + // [t16]=[t6] ^ [t14] + PXOR X9, X1 + + // store t16 + MOVOU X1, (AX) + + // [t15]=^(r10 ^ r6) + PXOR X12, X8 + PANDN X0, X8 + + // store t15 + MOVOU X8, 112(AX) + RET + +// func l128(x *byte, buffer *byte) +// Requires: SSE2 +TEXT ·l128(SB), NOSPLIT, $0-16 + MOVQ x+0(FP), AX + MOVQ buffer+8(FP), CX + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 288(AX), X5 + MOVOU 352(AX), X6 + MOVOU 416(AX), X7 + MOVOU 480(AX), X8 + MOVOU 32(AX), X9 + + // 0=0^24^14^22^30 + MOVOU X0, X4 + PXOR X3, X4 + PXOR 224(AX), X4 + PXOR X6, X4 + PXOR X8, X4 + MOVOU X4, (CX) + + // 2=0^2^26^8^16 + MOVOU X0, X4 + PXOR X9, X4 + PXOR X7, X4 + PXOR X1, X4 + PXOR X2, X4 + MOVOU X4, 32(CX) + + // 8=0^8^22^30^6 + MOVOU X0, X4 + PXOR X1, X4 + PXOR X6, X4 + PXOR X8, X4 + PXOR 96(AX), X4 + MOVOU X4, 128(CX) + + // 18=0^18^10^16^24 + MOVOU X0, X4 + PXOR X5, X4 + PXOR 160(AX), X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 288(CX) + + // 26=0^26^18^24^8 + PXOR X1, X0 + PXOR X7, X0 + PXOR X5, X0 + PXOR X3, X0 + MOVOU X0, 416(CX) + + // 10=10^2^8^16^24 + MOVOU X9, X4 + PXOR 160(AX), X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 160(CX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X5 + + // 16=16^8^30^6^14 + PXOR X2, X1 + PXOR X8, X1 + PXOR X0, X1 + PXOR X5, X1 + MOVOU X1, 256(CX) + + // 24=24^16^6^14^22 + PXOR X3, X2 + PXOR X0, X2 + PXOR 
X5, X2 + PXOR X6, X2 + MOVOU X2, 384(CX) + MOVOU 64(AX), X1 + MOVOU 160(AX), X2 + MOVOU 192(AX), X3 + + // 4=4^28^2^10^18 + MOVOU X9, X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR 288(AX), X4 + PXOR 448(AX), X4 + MOVOU X4, 64(CX) + + // 20=20^12^18^26^2 + MOVOU X9, X4 + PXOR 320(AX), X4 + PXOR X3, X4 + PXOR 288(AX), X4 + PXOR X7, X4 + MOVOU X4, 320(CX) -xor32_loop: - MOVOU (AX)(BX*1), X0 - MOVOU (CX)(BX*1), X1 + // 28=28^20^26^2^10 + PXOR 448(AX), X9 + PXOR 320(AX), X9 + PXOR X7, X9 + PXOR X2, X9 + MOVOU X9, 448(CX) + MOVOU 320(AX), X9 + + // 6=6^30^4^12^20 + MOVOU X1, X4 + PXOR X0, X4 + PXOR X3, X4 + PXOR X8, X4 + PXOR X9, X4 + MOVOU X4, 96(CX) + + // 12=12^4^10^18^26 + MOVOU X1, X4 + PXOR X3, X4 + PXOR X2, X4 + PXOR 288(AX), X4 + PXOR X7, X4 + MOVOU X4, 192(CX) + MOVOU 448(AX), X7 + + // 22=22^14^20^28^4 + MOVOU X1, X4 + PXOR X5, X4 + PXOR X6, X4 + PXOR X9, X4 + PXOR X7, X4 + MOVOU X4, 352(CX) + + // 30=30^22^28^4^12 + PXOR X8, X1 + PXOR X6, X1 + PXOR X3, X1 + PXOR X7, X1 + MOVOU X1, 480(CX) + + // 14=14^6^12^20^28 + PXOR X3, X0 + PXOR X7, X0 + PXOR X9, X0 + PXOR X5, X0 + MOVOU X0, 224(CX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 304(AX), X5 + MOVOU 368(AX), X6 + MOVOU 432(AX), X7 + MOVOU 496(AX), X8 + MOVOU 48(AX), X9 + + // 1=1^25^15^23^31 + MOVOU X0, X4 + PXOR X3, X4 + PXOR 240(AX), X4 + PXOR X6, X4 + PXOR X8, X4 + MOVOU X4, 16(CX) + + // 3=3^27^1^9^17 + MOVOU X0, X4 + PXOR X9, X4 + PXOR X7, X4 + PXOR X1, X4 + PXOR X2, X4 + MOVOU X4, 48(CX) + + // 9=9^1^23^31^7 + MOVOU X0, X4 + PXOR X1, X4 + PXOR X6, X4 + PXOR X8, X4 + PXOR 112(AX), X4 + MOVOU X4, 144(CX) + + // 19=1^19^11^17^25 + MOVOU X0, X4 + PXOR X5, X4 + PXOR 176(AX), X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 304(CX) + + // 27=1^27^19^25^9 + PXOR X1, X0 + PXOR X7, X0 + PXOR X5, X0 + PXOR X3, X0 + MOVOU X0, 432(CX) + + // 11=11^3^9^17^25 + MOVOU X9, X4 + PXOR 176(AX), X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 176(CX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X5 + + // 17=17^9^31^7^15 + PXOR X2, X1 + PXOR X8, X1 PXOR X0, X1 - MOVOU X1, (DX)(BX*1) - ADDQ $0x10, BX - CMPQ BX, $0x00000200 - JL xor32_loop + PXOR X5, X1 + MOVOU X1, 272(CX) + + // 25=25^17^7^15^23 + PXOR X3, X2 + PXOR X0, X2 + PXOR X5, X2 + PXOR X6, X2 + MOVOU X2, 400(CX) + MOVOU 80(AX), X1 + MOVOU 176(AX), X2 + MOVOU 208(AX), X3 + + // 5=5^29^3^11^19 + MOVOU X9, X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR 304(AX), X4 + PXOR 464(AX), X4 + MOVOU X4, 80(CX) + + // 21=21^13^19^27^3 + MOVOU X9, X4 + PXOR 336(AX), X4 + PXOR X3, X4 + PXOR 304(AX), X4 + PXOR X7, X4 + MOVOU X4, 336(CX) + + // 29=29^21^27^3^11 + PXOR 464(AX), X9 + PXOR 336(AX), X9 + PXOR X7, X9 + PXOR X2, X9 + MOVOU X9, 464(CX) + MOVOU 336(AX), X9 + + // 7=7^31^5^13^21 + MOVOU X1, X4 + PXOR X0, X4 + PXOR X3, X4 + PXOR X8, X4 + PXOR X9, X4 + MOVOU X4, 112(CX) + + // 13=13^5^11^19^27 + MOVOU X1, X4 + PXOR X3, X4 + PXOR X2, X4 + PXOR 304(AX), X4 + PXOR X7, X4 + MOVOU X4, 208(CX) + MOVOU 464(AX), X7 + + // 23=23^15^21^29^5 + MOVOU X1, X4 + PXOR X5, X4 + PXOR X6, X4 + PXOR X9, X4 + PXOR X7, X4 + MOVOU X4, 368(CX) + + // 31=31^23^29^5^13 + PXOR X8, X1 + PXOR X6, X1 + PXOR X3, X1 + PXOR X7, X1 + MOVOU X1, 496(CX) + + // 15=15^7^13^21^29 + PXOR X3, X0 + PXOR X7, X0 + PXOR X9, X0 + PXOR X5, X0 + MOVOU X0, 240(CX) RET -// func xor32x128avx(x *byte, y *byte, out *byte) +// func l256(x *byte, buffer *byte) // Requires: AVX, AVX2 -TEXT ·xor32x128avx(SB), NOSPLIT, $0-24 - MOVQ x+0(FP), AX - MOVQ y+8(FP), CX - MOVQ out+16(FP), DX - XORQ BX, BX +TEXT ·l256(SB), NOSPLIT, $0-16 + MOVQ 
x+0(FP), AX + MOVQ buffer+8(FP), CX + VMOVDQU (AX), Y0 + VMOVDQU 256(AX), Y1 + VMOVDQU 512(AX), Y2 + VMOVDQU 768(AX), Y3 + VMOVDQU 576(AX), Y5 + VMOVDQU 704(AX), Y6 + VMOVDQU 832(AX), Y7 + VMOVDQU 960(AX), Y8 + VMOVDQU 64(AX), Y9 -xor32_loop_avx: - VMOVDQU (AX)(BX*1), Y0 - VMOVDQU (CX)(BX*1), Y1 + // 0=0^24^14^22^30 + VPXOR Y3, Y0, Y4 + VPXOR 448(AX), Y4, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VMOVDQU Y4, (CX) + + // 2=0^2^26^8^16 + VPXOR Y9, Y0, Y4 + VPXOR Y7, Y4, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VMOVDQU Y4, 64(CX) + + // 8=0^8^22^30^6 + VPXOR Y1, Y0, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR 192(AX), Y4, Y4 + VMOVDQU Y4, 256(CX) + + // 18=0^18^10^16^24 + VPXOR Y5, Y0, Y4 + VPXOR 320(AX), Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 576(CX) + + // 26=0^26^18^24^8 + VPXOR Y1, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y5, Y0, Y0 + VPXOR Y3, Y0, Y0 + VMOVDQU Y0, 832(CX) + + // 10=10^2^8^16^24 + VPXOR 320(AX), Y9, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 320(CX) + VMOVDQU 192(AX), Y0 + VMOVDQU 448(AX), Y5 + + // 16=16^8^30^6^14 + VPXOR Y2, Y1, Y1 + VPXOR Y8, Y1, Y1 VPXOR Y0, Y1, Y1 - VMOVDQU Y1, (DX)(BX*1) - ADDQ $0x20, BX - CMPQ BX, $0x00000200 - JL xor32_loop_avx - VZEROUPPER - RET + VPXOR Y5, Y1, Y1 + VMOVDQU Y1, 512(CX) -// func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) -// Requires: SSE2 -TEXT ·xorRoundKey128(SB), NOSPLIT, $0-40 - MOVL rk+0(FP), AX - MOVQ x1+8(FP), CX - MOVQ x2+16(FP), DX - MOVQ x3+24(FP), BX - MOVQ out+32(FP), SI - PCMPEQB X1, X1 - XORQ R8, R8 + // 24=24^16^6^14^22 + VPXOR Y3, Y2, Y2 + VPXOR Y0, Y2, Y2 + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + VMOVDQU Y2, 768(CX) + VMOVDQU 128(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 384(AX), Y3 - // Handle first byte - MOVL $0x01000000, DI + // 4=4^28^2^10^18 + VPXOR Y1, Y9, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR 896(AX), Y4, Y4 + VMOVDQU Y4, 128(CX) -rk_loop_1: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_1_c - PXOR X1, X0 + // 20=20^12^18^26^2 + VPXOR 640(AX), Y9, Y4 + VPXOR Y3, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 640(CX) + + // 28=28^20^26^2^10 + VPXOR 896(AX), Y9, Y9 + VPXOR 640(AX), Y9, Y9 + VPXOR Y7, Y9, Y9 + VPXOR Y2, Y9, Y9 + VMOVDQU Y9, 896(CX) + VMOVDQU 640(AX), Y9 + + // 6=6^30^4^12^20 + VPXOR Y0, Y1, Y4 + VPXOR Y3, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + VMOVDQU Y4, 192(CX) + + // 12=12^4^10^18^26 + VPXOR Y3, Y1, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 384(CX) + VMOVDQU 896(AX), Y7 + + // 22=22^14^20^28^4 + VPXOR Y5, Y1, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y9, Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 704(CX) + + // 30=30^22^28^4^12 + VPXOR Y8, Y1, Y1 + VPXOR Y6, Y1, Y1 + VPXOR Y3, Y1, Y1 + VPXOR Y7, Y1, Y1 + VMOVDQU Y1, 960(CX) + + // 14=14^6^12^20^28 + VPXOR Y3, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y0, Y0 + VPXOR Y5, Y0, Y0 + VMOVDQU Y0, 448(CX) + VMOVDQU 32(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 544(AX), Y2 + VMOVDQU 800(AX), Y3 + VMOVDQU 608(AX), Y5 + VMOVDQU 736(AX), Y6 + VMOVDQU 864(AX), Y7 + VMOVDQU 992(AX), Y8 + VMOVDQU 96(AX), Y9 + + // 1=1^25^15^23^31 + VPXOR Y3, Y0, Y4 + VPXOR 480(AX), Y4, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VMOVDQU Y4, 32(CX) + + // 3=3^27^1^9^17 + VPXOR Y9, Y0, Y4 + VPXOR Y7, Y4, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VMOVDQU Y4, 96(CX) + + // 9=9^1^23^31^7 + VPXOR Y1, Y0, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR 224(AX), Y4, Y4 + VMOVDQU Y4, 288(CX) + + // 
19=1^19^11^17^25 + VPXOR Y5, Y0, Y4 + VPXOR 352(AX), Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 608(CX) + + // 27=1^27^19^25^9 + VPXOR Y1, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y5, Y0, Y0 + VPXOR Y3, Y0, Y0 + VMOVDQU Y0, 864(CX) -rk_loop_1_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000080 - JL rk_loop_1 + // 11=11^3^9^17^25 + VPXOR 352(AX), Y9, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 352(CX) + VMOVDQU 224(AX), Y0 + VMOVDQU 480(AX), Y5 - // Handle second byte - MOVL $0x00010000, DI + // 17=17^9^31^7^15 + VPXOR Y2, Y1, Y1 + VPXOR Y8, Y1, Y1 + VPXOR Y0, Y1, Y1 + VPXOR Y5, Y1, Y1 + VMOVDQU Y1, 544(CX) -rk_loop_2: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_2_c - PXOR X1, X0 + // 25=25^17^7^15^23 + VPXOR Y3, Y2, Y2 + VPXOR Y0, Y2, Y2 + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + VMOVDQU Y2, 800(CX) + VMOVDQU 160(AX), Y1 + VMOVDQU 352(AX), Y2 + VMOVDQU 416(AX), Y3 -rk_loop_2_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000100 - JL rk_loop_2 + // 5=5^29^3^11^19 + VPXOR Y1, Y9, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR 928(AX), Y4, Y4 + VMOVDQU Y4, 160(CX) - // Handle third byte - MOVL $0x00000100, DI + // 21=21^13^19^27^3 + VPXOR 672(AX), Y9, Y4 + VPXOR Y3, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 672(CX) -rk_loop_3: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_3_c - PXOR X1, X0 + // 29=29^21^27^3^11 + VPXOR 928(AX), Y9, Y9 + VPXOR 672(AX), Y9, Y9 + VPXOR Y7, Y9, Y9 + VPXOR Y2, Y9, Y9 + VMOVDQU Y9, 928(CX) + VMOVDQU 672(AX), Y9 -rk_loop_3_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000180 - JL rk_loop_3 + // 7=7^31^5^13^21 + VPXOR Y0, Y1, Y4 + VPXOR Y3, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + VMOVDQU Y4, 224(CX) - // Handle last byte - MOVL $0x00000001, DI + // 13=13^5^11^19^27 + VPXOR Y3, Y1, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 416(CX) + VMOVDQU 928(AX), Y7 -rk_loop_4: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_4_c - PXOR X1, X0 + // 23=23^15^21^29^5 + VPXOR Y5, Y1, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y9, Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 736(CX) -rk_loop_4_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000200 - JL rk_loop_4 + // 31=31^23^29^5^13 + VPXOR Y8, Y1, Y1 + VPXOR Y6, Y1, Y1 + VPXOR Y3, Y1, Y1 + VPXOR Y7, Y1, Y1 + VMOVDQU Y1, 992(CX) + + // 15=15^7^13^21^29 + VPXOR Y3, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y0, Y0 + VPXOR Y5, Y0, Y0 + VMOVDQU Y0, 480(CX) + VZEROUPPER RET -// func sbox128(x *byte, buffer *byte) -// Requires: SSE2 -TEXT ·sbox128(SB), NOSPLIT, $0-16 +// func sbox256avx2(x *byte, buffer *byte) +// Requires: AVX, AVX2 +TEXT ·sbox256avx2(SB), NOSPLIT, $0-16 MOVQ x+0(FP), AX MOVQ buffer+8(FP), CX // f, for not operation - PCMPEQB X0, X0 + VPCMPEQB Y0, Y0, Y0 // Start input function // t1=b7 ^ b5 - MOVOU 112(AX), X1 - PXOR 80(AX), X1 - MOVOU 16(AX), X2 - MOVOU X2, X3 - MOVOU X2, X4 + VMOVDQU 224(AX), Y1 + VPXOR 160(AX), Y1, Y1 + VMOVDQU 32(AX), Y4 // store m6=b1 - MOVOU X2, 224(CX) + VMOVDQU Y4, 448(CX) // t2=b5 ^ b1 - PXOR 80(AX), X2 - PANDN X0, X2 - - // store g5=^b0 - MOVOU (AX), X5 - MOVOU X5, X6 - PANDN X0, X6 - MOVOU X6, 80(CX) + VPXOR 160(AX), Y4, Y2 + VPANDN Y0, Y2, Y2 + VMOVDQU (AX), Y6 // t3=^(b0 ^ t2) - PXOR X2, X5 - PANDN X0, X5 + VPXOR Y2, Y6, Y5 + VPANDN Y0, Y5, Y5 + + 
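In l128 and l256 above, every output bit plane is the XOR of exactly five input planes — the shape of SM4's linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24) in bitsliced form, with the five plane indices per output taken from the comments (e.g. "0=0^24^14^22^30") under this implementation's bit ordering. A plain-Go sketch of one output plane; xorFivePlanes, state and buf are illustrative names only:

// x holds 32 input bit planes of 16 bytes each (the l128 layout);
// out receives one output plane, the XOR of the five planes named in idx.
func xorFivePlanes(x *[32][16]byte, out *[16]byte, idx [5]int) {
	for i := range out {
		out[i] = x[idx[0]][i] ^ x[idx[1]][i] ^ x[idx[2]][i] ^ x[idx[3]][i] ^ x[idx[4]][i]
	}
}

// e.g. output plane 0 of l128: xorFivePlanes(&state, &buf[0], [5]int{0, 24, 14, 22, 30})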
// store g5=^b0 + VPANDN Y0, Y6, Y6 + VMOVDQU Y6, 160(CX) // t4=b6 ^ b2 - MOVOU 96(AX), X6 - MOVOU X6, X7 - PXOR 32(AX), X6 + VMOVDQU 192(AX), Y7 + VPXOR 64(AX), Y7, Y6 // t5=b3 ^ t3 - MOVOU 48(AX), X8 - MOVOU X8, X9 - PXOR X5, X8 + VMOVDQU 96(AX), Y9 + VPXOR Y5, Y9, Y8 // t6=b4 ^ t1 - MOVOU 64(AX), X10 - PXOR X1, X10 + VPXOR 128(AX), Y1, Y10 // t7=b1 ^ t5 - PXOR X8, X3 + VPXOR Y8, Y4, Y3 // t8=b1 ^ t4 - PXOR X6, X4 + VPXOR Y6, Y4, Y4 // t9=t6 ^ t8 - MOVOU X10, X11 - PXOR X4, X11 + VPXOR Y4, Y10, Y11 // store m8 - MOVOU X11, 256(CX) + VMOVDQU Y11, 512(CX) // store g1 - MOVOU X3, 16(CX) + VMOVDQU Y3, 32(CX) // store g3 - MOVOU X8, 48(CX) + VMOVDQU Y8, 96(CX) // store g4 - MOVOU X2, 64(CX) + VMOVDQU Y2, 128(CX) // store m0 - MOVOU X10, 128(CX) + VMOVDQU Y10, 256(CX) // store m1 - MOVOU X5, 144(CX) + VMOVDQU Y5, 288(CX) // store m2 - MOVOU X4, 160(CX) + VMOVDQU Y4, 320(CX) // store m4 - MOVOU X6, 192(CX) + VMOVDQU Y6, 384(CX) // t11=^(b3 ^ t1) - PXOR X1, X9 - PANDN X0, X9 + VPXOR Y1, Y9, Y9 + VPANDN Y0, Y9, Y9 // store m5, can reuse t1 now - MOVOU X9, 208(CX) + VMOVDQU Y9, 416(CX) // t12=^(b6 ^ t9) - PXOR X11, X7 - PANDN X0, X7 + VPXOR Y11, Y7, Y7 + VPANDN Y0, Y7, Y7 // store m9, can reuse t7 t8 t9 now - MOVOU X7, 272(CX) + VMOVDQU Y7, 544(CX) // t10=t6 ^ t7 - PXOR X10, X3 + VPXOR Y10, Y3, Y3 // store g0, can reuse t6 now - MOVOU X3, (CX) + VMOVDQU Y3, (CX) // t13=t4 ^ t10 - PXOR X6, X3 + VPXOR Y6, Y3, Y3 // store g2, can reuse t4 now - MOVOU X3, 32(CX) + VMOVDQU Y3, 64(CX) // t14=t2 ^ t11 - MOVOU X9, X1 - PXOR X2, X1 + VPXOR Y2, Y9, Y1 // store g6, can reuse t2 now - MOVOU X1, 96(CX) + VMOVDQU Y1, 192(CX) // t15=t12^t14 - PXOR X7, X1 + VPXOR Y7, Y1, Y1 // store g7 - MOVOU X1, 112(CX) + VMOVDQU Y1, 224(CX) // t16=t3 ^ t12 - PXOR X5, X7 + VPXOR Y5, Y7, Y7 // store m3 - MOVOU X7, 176(CX) + VMOVDQU Y7, 352(CX) // t17=t11 ^ t16 - PXOR X9, X7 + VPXOR Y9, Y7, Y7 // store m7 - MOVOU X7, 240(CX) + VMOVDQU Y7, 480(CX) // Start top function // Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8 - // t2=^(m0 & m1) - PAND X10, X5 - PANDN X0, X5 + // t2= (m0 & m1) + VPAND Y10, Y5, Y5 - // t3=^(g0 & g4) - PAND (CX), X2 - PANDN X0, X2 + // t3= (g0 & g4) + VPAND (CX), Y2, Y2 - // t4=^(g3 & g7) - MOVOU X1, X10 - PAND X8, X1 - PANDN X0, X1 + // t4= (g3 & g7) + VPAND Y8, Y1, Y10 - // t7=^(g3 | g7) - POR X10, X8 - PANDN X0, X8 + // t7= (g3 | g7) + VPOR Y1, Y8, Y8 - // t11=^(m4 & m5) - PAND X6, X9 - PANDN X0, X9 - MOVOU 176(CX), X6 - MOVOU X6, X10 + // t11= (m4 & m5) + VPAND Y6, Y9, Y9 + VMOVDQU 352(CX), Y6 - // t10=^( m3 & m2 ) - PAND X4, X10 - PANDN X0, X10 + // t10= ( m3 & m2 ) + VPAND Y4, Y6, Y1 - // t12=^( m3 | m2 ) - POR X4, X6 - PANDN X0, X6 + // t12= ( m3 | m2 ) + VPOR Y4, Y6, Y6 - // t6=^( g6 | g2 ) - POR 96(CX), X3 - PANDN X0, X3 + // t6= ( g6 | g2 ) + VPOR 192(CX), Y3, Y3 - // t9=^( m6 | m7 ) - POR 224(CX), X7 - PANDN X0, X7 - MOVOU 272(CX), X4 - MOVOU X4, X12 + // t9= ( m6 | m7 ) + VPOR 448(CX), Y7, Y7 + VMOVDQU 544(CX), Y12 - // t5=^( m8 & m9 ) - PAND X11, X4 - PANDN X0, X4 + // t5= ( m8 & m9 ) + VPAND Y11, Y12, Y4 - // t8=^( m8 | m9 ) - POR X11, X12 - PANDN X0, X12 + // t8= ( m8 | m9 ) + VPOR Y11, Y12, Y12 // t14 = t3 ^ t2 - PXOR X5, X2 + VPXOR Y5, Y2, Y2 // t16 = t5 ^ t14 - PXOR X2, X4 + VPXOR Y2, Y4, Y4 // t20 = t16 ^ t7 - PXOR X4, X8 + VPXOR Y4, Y8, Y8 // t17 = t9 ^ t10 - PXOR X7, X10 + VPXOR Y7, Y1, Y1 // t18 = t11 ^ t12 - PXOR X9, X6 + VPXOR Y9, Y6, Y6 // p2 = t20 ^ t18 - PXOR X8, X6 + VPXOR Y8, Y6, Y6 // p0 = t6 ^ t16 - 
PXOR X3, X4 + VPXOR Y3, Y4, Y4 - // t1 = ^(g5 & g1) - MOVOU 16(CX), X2 - MOVOU 80(CX), X8 - PAND X2, X8 - PANDN X0, X8 + // t1 = (g5 & g1) + VMOVDQU 32(CX), Y2 + VMOVDQU 160(CX), Y8 + VPAND Y2, Y8, Y8 // t13 = t1 ^ t2 - PXOR X8, X5 + VPXOR Y8, Y5, Y5 // t15 = t13 ^ t4 - PXOR X1, X5 + VPXOR Y10, Y5, Y5 // t19 = t6 ^ t15 - PXOR X5, X3 + VPXOR Y5, Y3, Y3 // p3 = t19 ^ t17 - PXOR X10, X3 + VPXOR Y1, Y3, Y3 // p1 = t8 ^ t15 - PXOR X12, X5 + VPXOR Y12, Y5, Y5 // start middle function // current register status: t8=p0, t3=p1, t4=p2, t7=p0 - // t1 = ^(p3 & p0) - MOVOU X4, X1 - PAND X3, X1 - PANDN X0, X1 + // t0 = (p1 & p2) + VPAND Y5, Y6, Y1 - // t2 = ^(t1 | p2) - MOVOU X6, X2 - POR X1, X2 - PANDN X0, X2 + // t1 = (p3 & p0) + VPAND Y3, Y4, Y2 - // t3 = ^(p2 & p0) - MOVOU X6, X8 - PAND X4, X6 - PANDN X0, X6 + // t2 = (p0 & p2) + VPAND Y6, Y4, Y8 - // t4 = p1 ^ t3 - PXOR X5, X6 + // t3 = (p1 & p3) + VPAND Y5, Y3, Y10 - // t5 = ^(p2 | t4) - MOVOU X8, X11 - POR X6, X8 - PANDN X0, X8 + // t4 = (t0 & t2) + VPAND Y1, Y8, Y11 - // t6 = ^(p1 & t4) - MOVOU X5, X10 - PAND X6, X10 - PANDN X0, X10 + // t5 = (t1 ^ t3) + VPXOR Y2, Y10, Y12 - // t7 = ^(p3 | t4) - MOVOU X3, X9 - POR X6, X3 - PANDN X0, X3 + // t6 = (t5 | p0) + VPOR Y12, Y4, Y4 - // t8 = ^(t7 | t2) - MOVOU X4, X7 - MOVOU X3, X4 - POR X2, X4 - PANDN X0, X4 + // t7 = (t2 | p3) + VPOR Y8, Y3, Y3 - // t9 = ^(t7 ^ t5) - PXOR X8, X3 - PANDN X0, X3 + // t8 = (t4 ^ t6) + VPXOR Y11, Y4, Y4 - // t10 = ^(t9 ^ p3) - PXOR X3, X9 - PANDN X0, X9 + // t9 = (t7 ^ t3) + VPXOR Y10, Y3, Y11 - // t11 = ^(t6 & t8) - PAND X4, X10 - PANDN X0, X10 + // t10 = (t0 ^ t9) + VPXOR Y1, Y11, Y1 - // t12 = ^(p1 & t8) - PAND X4, X5 - PANDN X0, X5 + // t11 = (t5 | p2) + VPOR Y6, Y12, Y6 - // t13 = ^(t12 ^ p0) - PXOR X5, X7 - PANDN X0, X7 + // l1 = t11 ^ t1 + VPXOR Y2, Y6, Y2 - // t14 = ^(t1 & p2) - PAND X1, X11 - PANDN X0, X11 + // t12 = (t2 | p1) + VPOR Y8, Y5, Y5 - // t15 = ^(t14 & t9) - PAND X11, X3 - PANDN X0, X3 + // l2 = t12 ^ t5 + VPXOR Y5, Y12, Y5 // start bottom function - // current register status: t11=l0, t7=l1, t6=l2, t12=l3 + // current register status: t1=l0, t2=l1, t3=l2, t8=l3 // k4 = l2 ^ l3 - MOVOU X7, X8 - PXOR X10, X8 + VPXOR Y5, Y4, Y3 // k3 = l1 ^ l3 - MOVOU X7, X6 - PXOR X3, X6 + VPXOR Y4, Y2, Y10 // k2 = l0 ^ l2 - MOVOU X10, X5 - PXOR X9, X5 + VPXOR Y1, Y5, Y8 // k0 = l0 ^ l1 - MOVOU X3, X1 - PXOR X9, X1 + VPXOR Y1, Y2, Y6 // k1 = k2 ^ k3 - MOVOU X6, X2 - PXOR X5, X2 + VPXOR Y8, Y10, Y11 - // e0=^(m1 & k0) - MOVOU 144(CX), X4 - PAND X1, X4 - PANDN X0, X4 + // e0= (m1 & k0) + VMOVDQU 288(CX), Y12 + VPAND Y6, Y12, Y12 - // e1=^(g5 & l1) - MOVOU 80(CX), X11 - PAND X3, X11 - PANDN X0, X11 + // e1= (g5 & l1) + VMOVDQU 160(CX), Y9 + VPAND Y2, Y9, Y9 // r0=e0 ^ e1 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e2=^(g4 & l0) - MOVOU 64(CX), X12 - PAND X9, X12 - PANDN X0, X12 + // e2=(g4 & l0) + VMOVDQU 128(CX), Y7 + VPAND Y1, Y7, Y7 // r1=e2 ^ e1 - PXOR X12, X11 + VPXOR Y7, Y9, Y9 // store r0 r1 - MOVOU X4, 352(CX) - MOVOU X11, 368(CX) + VMOVDQU Y12, 704(CX) + VMOVDQU Y9, 736(CX) - // e3=^(m7 & k3) - MOVOU 240(CX), X4 - PAND X6, X4 - PANDN X0, X4 + // e3= (m7 & k3) + VMOVDQU 480(CX), Y12 + VPAND Y10, Y12, Y12 - // e4=^(m5 & k2) - MOVOU 208(CX), X11 - PAND X5, X11 - PANDN X0, X11 + // e4= (m5 & k2) + VMOVDQU 416(CX), Y9 + VPAND Y8, Y9, Y9 // r2=e3 ^ e4 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e5=^(m3 & k1) - MOVOU 176(CX), X12 - PAND X2, X12 - PANDN X0, X12 + // e5= (m3 & k1) + VMOVDQU 352(CX), Y7 + VPAND Y11, Y7, Y7 // r3=e5 ^ e4 - PXOR X12, X11 + VPXOR Y7, Y9, Y9 // store 
r2 r3 - MOVOU X4, 384(CX) - MOVOU X11, 400(CX) + VMOVDQU Y12, 768(CX) + VMOVDQU Y9, 800(CX) - // e6=^(m9 & k4) - MOVOU 272(CX), X4 - PAND X8, X4 - PANDN X0, X4 + // e6=(m9 & k4) + VMOVDQU 544(CX), Y12 + VPAND Y3, Y12, Y12 - // e7=^(g7 & l3) - MOVOU 112(CX), X11 - PAND X7, X11 - PANDN X0, X11 + // e7=(g7 & l3) + VMOVDQU 224(CX), Y9 + VPAND Y4, Y9, Y9 // r4=e7 ^ e6 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e8=^(g6 & l2) - MOVOU 96(CX), X12 - PAND X10, X12 - PANDN X0, X12 + // e8=(g6 & l2) + VMOVDQU 192(CX), Y7 + VPAND Y5, Y7, Y7 // r5=e8 ^ e6 - PXOR X12, X11 + VPXOR Y7, Y9, Y7 - // store r4 r5 - MOVOU X4, 416(CX) - MOVOU X11, 432(CX) + // store r4 + VMOVDQU Y12, 832(CX) - // e9=^(m0 & k0) - MOVOU 128(CX), X4 - PAND X1, X4 - PANDN X0, X4 + // e9=(m0 & k0) + VMOVDQU 256(CX), Y12 + VPAND Y6, Y12, Y12 - // e10=^(g1 & l1) - MOVOU 16(CX), X1 - PAND X3, X1 - PANDN X0, X1 + // e10=(g1 & l1) + VMOVDQU 32(CX), Y6 + VPAND Y6, Y2, Y2 // r6=e9 ^ e10 - PXOR X1, X4 + VPXOR Y12, Y2, Y12 - // e11=^(g0 & l0) - MOVOU (CX), X12 - PAND X9, X12 - PANDN X0, X12 + // e11=(g0 & l0) + VMOVDQU (CX), Y9 + VPAND Y1, Y9, Y9 // r7=e11 ^ e10 - PXOR X12, X1 + VPXOR Y9, Y2, Y1 - // store r6 - MOVOU X4, 448(CX) + // e12=(m6 & k3) + VMOVDQU 448(CX), Y2 + VPAND Y2, Y10, Y2 - // e12=^(m6 & k3) - MOVOU 224(CX), X3 - PAND X6, X3 - PANDN X0, X3 - - // e13=^(m4 & k2) - MOVOU 192(CX), X9 - PAND X5, X9 - PANDN X0, X9 + // e13=(m4 & k2) + VMOVDQU 384(CX), Y10 + VPAND Y10, Y8, Y8 // r8=e12 ^ e13 - PXOR X9, X3 + VPXOR Y2, Y8, Y2 - // e14=^(m2 & k1) - MOVOU 160(CX), X12 - PAND X2, X12 - PANDN X0, X12 + // e14=(m2 & k1) + VMOVDQU 320(CX), Y10 + VPAND Y10, Y11, Y10 // r9=e14 ^ e13 - PXOR X12, X9 + VPXOR Y8, Y10, Y6 - // e15=^(m8 & k4) - MOVOU 256(CX), X4 - PAND X8, X4 - PANDN X0, X4 + // e15=(m8 & k4) + VMOVDQU 512(CX), Y11 + VPAND Y11, Y3, Y11 - // e16=^(g3 & l3) - MOVOU 48(CX), X11 - PAND X7, X11 - PANDN X0, X11 + // e16=(g3 & l3) + VMOVDQU 96(CX), Y3 + VPAND Y3, Y4, Y4 // r10=e15 ^ e16 - PXOR X11, X4 + VPXOR Y11, Y4, Y8 - // e17=^(g2 & l2) - MOVOU 32(CX), X12 - PAND X10, X12 - PANDN X0, X12 + // e17=(g2 & l2) + VMOVDQU 64(CX), Y11 + VPAND Y5, Y11, Y5 // r11=e17 ^ e16 - PXOR X12, X11 + VPXOR Y5, Y4, Y5 // start output function // [t1]=r7 ^ r9 - PXOR X1, X9 - - // t2=t1 ^ r1 - MOVOU 368(CX), X2 - PXOR X9, X2 - - // t3=t2 ^ r3 - MOVOU 400(CX), X5 - MOVOU X5, X6 - PXOR X2, X5 - - // t4=r5 ^ r3 - PXOR 432(CX), X6 - MOVOU 416(CX), X8 - MOVOU X8, X10 - - // t5=r4 ^ t4 - PXOR X6, X8 - - // t6=r0 ^ t4 - PXOR 352(CX), X10 - - // [t7]=r11 ^ r7 - PXOR X11, X1 - - // [t8]=[t1] ^ t4 - PXOR X9, X6 - - // store t8 - MOVOU X6, 80(AX) - - // [t9]=[t1] ^ t6 - PXOR X10, X9 - - // store t9 - MOVOU X9, 32(AX) - - // [t10]=r2 ^ t5 - PXOR 384(CX), X8 - - // [t11]=r10 ^ r8 - PXOR X4, X3 - - // store t11 - MOVOU X3, 48(AX) - - // [t12]=^(t3 ^ [t11]) - PXOR X5, X3 - PANDN X0, X3 - - // store t12 - MOVOU X3, 16(AX) - - // [t13]=[t10] ^ [t12] - PXOR X3, X8 - - // store t13 - MOVOU X8, 96(AX) - - // [t14]=^(t3 ^ [t7]) - PXOR X5, X1 - PANDN X0, X1 - - // store t14 - MOVOU X1, 64(AX) - - // [t16]=t6 ^ [t14] - PXOR X10, X1 - - // store t16 - MOVOU X1, (AX) - - // [t15]=^(r10 ^ r6) - PXOR 448(CX), X4 - PANDN X0, X4 - - // store t15 - MOVOU X4, 112(AX) - RET + VPXOR Y1, Y6, Y6 -// func l128(x *byte, buffer *byte) -// Requires: SSE2 -TEXT ·l128(SB), NOSPLIT, $0-16 - MOVQ x+0(FP), AX - MOVQ buffer+8(FP), CX - MOVOU (AX), X0 - MOVOU 128(AX), X1 - MOVOU 256(AX), X2 - MOVOU 384(AX), X3 - MOVOU 288(AX), X5 - MOVOU 352(AX), X6 - MOVOU 416(AX), X7 - MOVOU 480(AX), X8 - MOVOU 
32(AX), X9 + // [t2]=t1 ^ r1 + VMOVDQU 736(CX), Y10 + VPXOR Y6, Y10, Y10 - // 0=0^24^14^22^30 - MOVOU X0, X4 - PXOR X3, X4 - PXOR 224(AX), X4 - PXOR X6, X4 - PXOR X8, X4 - MOVOU X4, (CX) + // [t3]=t2 ^ r3 + VMOVDQU 800(CX), Y4 + VPXOR Y10, Y4, Y3 - // 2=0^2^26^8^16 - MOVOU X0, X4 - PXOR X9, X4 - PXOR X7, X4 - PXOR X1, X4 - PXOR X2, X4 - MOVOU X4, 32(CX) + // [t4]=r5 ^ r3 + VPXOR Y7, Y4, Y4 - // 8=0^8^22^30^6 - MOVOU X0, X4 - PXOR X1, X4 - PXOR X6, X4 - PXOR X8, X4 - PXOR 96(AX), X4 - MOVOU X4, 128(CX) + // [t5]=r4 ^ [t4] + VMOVDQU 832(CX), Y9 + VPXOR Y4, Y9, Y11 - // 18=0^18^10^16^24 - MOVOU X0, X4 - PXOR X5, X4 - PXOR 160(AX), X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 288(CX) + // [t6]=r0 ^ r4 + VPXOR 704(CX), Y9, Y9 - // 26=0^26^18^24^8 - PXOR X1, X0 - PXOR X7, X0 - PXOR X5, X0 - PXOR X3, X0 - MOVOU X0, 416(CX) + // [t7]=r11 ^ r7 + VPXOR Y5, Y1, Y1 - // 10=10^2^8^16^24 - MOVOU X9, X4 - PXOR 160(AX), X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 160(CX) - MOVOU 96(AX), X0 - MOVOU 224(AX), X5 + // [t8]=[t1] ^ [t4] + VPXOR Y6, Y4, Y4 - // 16=16^8^30^6^14 - PXOR X2, X1 - PXOR X8, X1 - PXOR X0, X1 - PXOR X5, X1 - MOVOU X1, 256(CX) + // store t8 + VMOVDQU Y4, 160(AX) - // 24=24^16^6^14^22 - PXOR X3, X2 - PXOR X0, X2 - PXOR X5, X2 - PXOR X6, X2 - MOVOU X2, 384(CX) - MOVOU 64(AX), X1 - MOVOU 160(AX), X2 - MOVOU 192(AX), X3 + // [t9]=[t1] ^ [t6] + VPXOR Y9, Y6, Y6 - // 4=4^28^2^10^18 - MOVOU X9, X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR 288(AX), X4 - PXOR 448(AX), X4 - MOVOU X4, 64(CX) + // store t9 + VMOVDQU Y6, 64(AX) - // 20=20^12^18^26^2 - MOVOU X9, X4 - PXOR 320(AX), X4 - PXOR X3, X4 - PXOR 288(AX), X4 - PXOR X7, X4 - MOVOU X4, 320(CX) + // [t10]=r2 ^ t5 + VPXOR 768(CX), Y11, Y11 - // 28=28^20^26^2^10 - PXOR 448(AX), X9 - PXOR 320(AX), X9 - PXOR X7, X9 - PXOR X2, X9 - MOVOU X9, 448(CX) - MOVOU 320(AX), X9 + // [t11]=r10 ^ r8 + VPXOR Y8, Y2, Y2 - // 6=6^30^4^12^20 - MOVOU X1, X4 - PXOR X0, X4 - PXOR X3, X4 - PXOR X8, X4 - PXOR X9, X4 - MOVOU X4, 96(CX) + // store t11 + VMOVDQU Y2, 96(AX) - // 12=12^4^10^18^26 - MOVOU X1, X4 - PXOR X3, X4 - PXOR X2, X4 - PXOR 288(AX), X4 - PXOR X7, X4 - MOVOU X4, 192(CX) - MOVOU 448(AX), X7 + // [t12]=^([t3] ^ [t11]) + VPXOR Y3, Y2, Y2 + VPANDN Y0, Y2, Y2 - // 22=22^14^20^28^4 - MOVOU X1, X4 - PXOR X5, X4 - PXOR X6, X4 - PXOR X9, X4 - PXOR X7, X4 - MOVOU X4, 352(CX) + // store t12 + VMOVDQU Y2, 32(AX) - // 30=30^22^28^4^12 - PXOR X8, X1 - PXOR X6, X1 - PXOR X3, X1 - PXOR X7, X1 - MOVOU X1, 480(CX) + // [t13]=[t10] ^ [t12] + VPXOR Y2, Y11, Y11 - // 14=14^6^12^20^28 - PXOR X3, X0 - PXOR X7, X0 - PXOR X9, X0 - PXOR X5, X0 - MOVOU X0, 224(CX) - MOVOU 16(AX), X0 - MOVOU 144(AX), X1 - MOVOU 272(AX), X2 - MOVOU 400(AX), X3 - MOVOU 304(AX), X5 - MOVOU 368(AX), X6 - MOVOU 432(AX), X7 - MOVOU 496(AX), X8 - MOVOU 48(AX), X9 + // store t13 + VMOVDQU Y11, 192(AX) - // 1=1^25^15^23^31 - MOVOU X0, X4 - PXOR X3, X4 - PXOR 240(AX), X4 - PXOR X6, X4 - PXOR X8, X4 - MOVOU X4, 16(CX) + // [t14]=^([t3] ^ [t7]) + VPXOR Y3, Y1, Y1 + VPANDN Y0, Y1, Y1 - // 3=3^27^1^9^17 - MOVOU X0, X4 - PXOR X9, X4 - PXOR X7, X4 - PXOR X1, X4 - PXOR X2, X4 - MOVOU X4, 48(CX) + // store t14 + VMOVDQU Y1, 128(AX) - // 9=9^1^23^31^7 - MOVOU X0, X4 - PXOR X1, X4 - PXOR X6, X4 - PXOR X8, X4 - PXOR 112(AX), X4 - MOVOU X4, 144(CX) + // [t16]=[t6] ^ [t14] + VPXOR Y9, Y1, Y1 - // 19=1^19^11^17^25 - MOVOU X0, X4 - PXOR X5, X4 - PXOR 176(AX), X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 304(CX) + // store t16 + VMOVDQU Y1, (AX) - // 27=1^27^19^25^9 - PXOR X1, X0 - PXOR X7, X0 - PXOR X5, X0 - PXOR X3, X0 - 
MOVOU X0, 432(CX) + // [t15]=^(r10 ^ r6) + VPXOR Y12, Y8, Y8 + VPANDN Y0, Y8, Y8 - // 11=11^3^9^17^25 - MOVOU X9, X4 - PXOR 176(AX), X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 176(CX) - MOVOU 112(AX), X0 - MOVOU 240(AX), X5 + // store t15 + VMOVDQU Y8, 224(AX) + VZEROUPPER + RET - // 17=17^9^31^7^15 - PXOR X2, X1 - PXOR X8, X1 - PXOR X0, X1 - PXOR X5, X1 - MOVOU X1, 272(CX) +// func xorRoundKey256avx2(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) +// Requires: AVX, AVX2 +TEXT ·xorRoundKey256avx2(SB), NOSPLIT, $0-40 + MOVL rk+0(FP), AX + MOVQ x1+8(FP), CX + MOVQ x2+16(FP), DX + MOVQ x3+24(FP), BX + MOVQ out+32(FP), SI + VPCMPEQB Y1, Y1, Y1 + XORQ R8, R8 - // 25=25^17^7^15^23 - PXOR X3, X2 - PXOR X0, X2 - PXOR X5, X2 - PXOR X6, X2 - MOVOU X2, 400(CX) - MOVOU 80(AX), X1 - MOVOU 176(AX), X2 - MOVOU 208(AX), X3 + // Handle first byte + MOVL $0x01000000, DI - // 5=5^29^3^11^19 - MOVOU X9, X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR 304(AX), X4 - PXOR 464(AX), X4 - MOVOU X4, 80(CX) +rk_loop_1: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_1_c + VPXOR Y1, Y0, Y0 - // 21=21^13^19^27^3 - MOVOU X9, X4 - PXOR 336(AX), X4 - PXOR X3, X4 - PXOR 304(AX), X4 - PXOR X7, X4 - MOVOU X4, 336(CX) +rk_loop_1_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000100 + JL rk_loop_1 - // 29=29^21^27^3^11 - PXOR 464(AX), X9 - PXOR 336(AX), X9 - PXOR X7, X9 - PXOR X2, X9 - MOVOU X9, 464(CX) - MOVOU 336(AX), X9 + // Handle second byte + MOVL $0x00010000, DI - // 7=7^31^5^13^21 - MOVOU X1, X4 - PXOR X0, X4 - PXOR X3, X4 - PXOR X8, X4 - PXOR X9, X4 - MOVOU X4, 112(CX) +rk_loop_2: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_2_c + VPXOR Y1, Y0, Y0 - // 13=13^5^11^19^27 - MOVOU X1, X4 - PXOR X3, X4 - PXOR X2, X4 - PXOR 304(AX), X4 - PXOR X7, X4 - MOVOU X4, 208(CX) - MOVOU 464(AX), X7 +rk_loop_2_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000200 + JL rk_loop_2 - // 23=23^15^21^29^5 - MOVOU X1, X4 - PXOR X5, X4 - PXOR X6, X4 - PXOR X9, X4 - PXOR X7, X4 - MOVOU X4, 368(CX) + // Handle third byte + MOVL $0x00000100, DI - // 31=31^23^29^5^13 - PXOR X8, X1 - PXOR X6, X1 - PXOR X3, X1 - PXOR X7, X1 - MOVOU X1, 496(CX) +rk_loop_3: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_3_c + VPXOR Y1, Y0, Y0 - // 15=15^7^13^21^29 - PXOR X3, X0 - PXOR X7, X0 - PXOR X9, X0 - PXOR X5, X0 - MOVOU X0, 240(CX) +rk_loop_3_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000300 + JL rk_loop_3 + + // Handle last byte + MOVL $0x00000001, DI + +rk_loop_4: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_4_c + VPXOR Y1, Y0, Y0 + +rk_loop_4_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000400 + JL rk_loop_4 RET diff --git a/transpose_amd64_test.go b/transpose_amd64_test.go index 1d1bbe5..6fa55db 100644 --- a/transpose_amd64_test.go +++ b/transpose_amd64_test.go @@ -102,3 +102,50 @@ func BenchmarkBS128TransposeRevAvx(b *testing.B) { transpose128RevAvx(&input[0], &out[0]) } } + +func TestBS256TransposeRev(t *testing.T) { + in := make([]byte, 256*16) + ret := make([]byte, 256*16) + out := make([]byte, 256*16) + + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + for i := 0; i < 256; i++ { + copy(in[i*16:], key) + } + + 
transpose256avx(&in[0], &out[0]) + transpose128x256avx2(&out[0], &ret[0]) + if !bytes.Equal(in, ret) { + t.Fatalf("not expected %x", ret[:16]) + } +} + +func BenchmarkBS256TransposeAvx(b *testing.B) { + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + input := make([]byte, 256*16) + for i := 0; i < 256; i++ { + copy(input[i*16:], key) + } + out := make([]byte, 256*16) + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + transpose256avx(&input[0], &out[0]) + } +} + +func BenchmarkBS256TransposeRevAvx(b *testing.B) { + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + input := make([]byte, 256*16) + for i := 0; i < 256; i++ { + copy(input[i*16:], key) + } + out := make([]byte, 256*16) + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + transpose256RevAvx(&input[0], &out[0]) + } +}
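
For reference, the round trip exercised by TestBS256TransposeRev is a plain 256x128 bit-matrix transpose followed by its 128x256 inverse: bit (r, c) of the input becomes bit (c, r) of the output. The naive Go model below is written for this description and is not taken from the patch; the function name, the MSB-first bit order inside each byte, and the requirement that out be zeroed are assumptions, since the AVX routines fix their own lane order with the PSHUFB flip mask. Only the round-trip property, not the exact byte layout, is implied.

package bsref // hypothetical package, for illustration only

// naiveTransposeBits transposes a rows x cols bit matrix packed MSB-first
// into bytes: bit (r, c) of in becomes bit (c, r) of out.
// out must be zeroed by the caller and hold rows*cols/8 bytes.
func naiveTransposeBits(in, out []byte, rows, cols int) {
	for r := 0; r < rows; r++ {
		for c := 0; c < cols; c++ {
			if (in[(r*cols+c)/8]>>(7-uint(c)%8))&1 != 0 {
				out[(c*rows+r)/8] |= 1 << (7 - uint(r)%8)
			}
		}
	}
}

Calling naiveTransposeBits(in, out, 256, 128) and then naiveTransposeBits(out, ret, 128, 256) on zeroed buffers reproduces in, which is the same property the test asserts for transpose256avx followed by transpose128x256avx2.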
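
The new xorRoundKey256avx2 routine can be read the same way: the 256-block bitsliced state keeps one 32-byte plane per state bit, out is x1 ^ x2 ^ x3, and a set round-key bit complements the matching plane (VPXOR against the all-ones Y1). The four loops start their bit masks at 0x01000000, 0x00010000, 0x00000100 and 0x00000001 and rotate left after every plane, so rk is consumed most-significant byte first and least-significant bit first within each byte. The sketch below is only a reading of that assembly; the helper name and package are illustrative.

package bsref // hypothetical package, for illustration only

// xorRoundKeyRef mirrors the plane/bit walk of xorRoundKey256avx2:
// plane row (0..31) uses key bit 8*(3-row/8) + row%8, matching the
// byte-major, bit-minor order of the four assembly loops.
func xorRoundKeyRef(rk uint32, x1, x2, x3, out []byte) {
	const planeSize = 32 // 256 blocks, one bit per block
	for row := 0; row < 32; row++ {
		bit := uint(8*(3-row/8) + row%8)
		var mask byte
		if rk&(1<<bit) != 0 {
			mask = 0xff // complement the whole plane, as VPXOR with Y1 does
		}
		base := row * planeSize
		for i := 0; i < planeSize; i++ {
			out[base+i] = x1[base+i] ^ x2[base+i] ^ x3[base+i] ^ mask
		}
	}
}

Complementing a whole plane is the bitsliced equivalent of XORing that key bit into the same position of all 256 blocks at once.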