From 6a940afdb720bc761a419bef6afab287756d33e8 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Tue, 30 May 2023 15:26:03 +0800 Subject: [PATCH] 256 avx2 and optimize sbox via reduce logic operations --- _asm/bs_amd64_asm.go | 3605 ++++++++++++++++++++++--------- bs128.go | 2 +- bs128_test.go | 10 +- bs_amd64.go | 20 +- bs_amd64.s | 4469 +++++++++++++++++++++++++++++++++------ transpose_amd64_test.go | 47 + 6 files changed, 6420 insertions(+), 1733 deletions(-) diff --git a/_asm/bs_amd64_asm.go b/_asm/bs_amd64_asm.go index ebc0f45..c79cfe7 100644 --- a/_asm/bs_amd64_asm.go +++ b/_asm/bs_amd64_asm.go @@ -203,7 +203,7 @@ func transpose128() { RET() } -func getFirst4Bytes(flipMask, in Mem, o, addr, x Register) { +func getFirst4Bytes128(flipMask, in Mem, o, addr, x Register) { for i := 0; i < 4; i++ { MOVL(in.Idx(addr, 1), o) PINSRD(Imm(uint64(i)), o, x) @@ -212,6 +212,15 @@ func getFirst4Bytes(flipMask, in Mem, o, addr, x Register) { PSHUFB(flipMask, x) } +func getFirst4Bytes256(flipMask, in Mem, o, addr, x Register) { + for i := 0; i < 4; i++ { + MOVL(in.Idx(addr, 1), o) + PINSRD(Imm(uint64(i)), o, x) + ADDQ(Imm(32), addr) + } + PSHUFB(flipMask, x) +} + func transpose128avx(flipMask Mem) { // transpose128avx function TEXT("transpose128avx", NOSPLIT, "func(in, out *byte)") @@ -242,14 +251,14 @@ func transpose128avx(flipMask Mem) { SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of first 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -366,250 +375,198 @@ func transpose128avx(flipMask Mem) { RET() } -func transpose128Rev() { - // transpose128Rev function - TEXT("transpose128Rev", NOSPLIT, "func(in, out *byte)") - Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") +func transpose256avx(flipMask Mem) { + // transpose256avx function + TEXT("transpose256avx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, 256x128 => 128x256") in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} - tmp := XMM() - b := GP8() + h, l := X1, X0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + tmp := Y0 o := GP32() - - Comment("Initialize rr, current row, 96") - rr := zero() cc := GP64() - addr := GP64() - Label("row_loop_b3") + Comment("Initialize rr, current row") + rr := zero() + Label("row_loop") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b3") + Label("col_loop") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + addr := GP64() MOVQ(rr, addr) - ADDQ(Imm(96), addr) Comment("Multiple with ncols") SHLQ(Imm(7), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") 
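For orientation, the bit-level transpose that transpose128avx / transpose256avx (and the scalar fallbacks below) compute can be written as a plain-Go reference. This is only an illustrative sketch, not part of the patch; it assumes MSB-first bit numbering inside each byte, which is the convention the flip mask and the PMOVMSKB/PSLLQ extraction loop are arranged around.

	// Reference sketch (assumption: MSB-first bit order within each byte).
	// Bit (r, c) of an nrows x ncols bit matrix moves to bit (c, r) of the
	// ncols x nrows output; both are stored row-major, 8 bits per byte.
	// The index expressions mirror the (rr*ncols+cc)/8 and
	// ((cc+7)*nrows+rr)/8 address arithmetic in the generated code.
	// out must be zeroed by the caller.
	func transposeBitsRef(in, out []byte, nrows, ncols int) {
		for r := 0; r < nrows; r++ {
			for c := 0; c < ncols; c++ {
				bit := (in[(r*ncols+c)/8] >> uint(7-c%8)) & 1
				out[(c*nrows+r)/8] |= bit << uint(7-r%8)
			}
		}
	}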
- ADDQ(Imm(16), addr) - } + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b3")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(32)) - JL(LabelRef("row_loop_b3")) - - Label("row_loop_b2") - Comment("Initialize cc, current col") - XORQ(cc, cc) - Label("col_loop_b2") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - ADDQ(Imm(32), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b2")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(64)) - JL(LabelRef("row_loop_b2")) - - Label("row_loop_b1") - Comment("Initialize cc, current col") - 
XORQ(cc, cc) - Label("col_loop_b1") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - SUBQ(Imm(32), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - - Comment("Compare cc with ncols, here ncols=128") ADDQ(Imm(8), cc) - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b1")) - - Comment("Compare rr with nrows, here nrows=128") - ADDQ(Imm(16), rr) - CMPQ(rr, U8(96)) - JL(LabelRef("row_loop_b1")) - - Label("row_loop_b0") - Comment("Initialize cc, current col") - XORQ(cc, cc) - Label("col_loop_b0") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(rr, addr) - SUBQ(Imm(96), addr) - Comment("Multiple with ncols") - SHLQ(Imm(7), addr) - ADDQ(cc, addr) - SHRQ(Imm(3), addr) - Comment("Construct one XMM with first byte of first 16 rows") - for i := 0; i < 16; i++ { - MOVB(in.Idx(addr, 1), b) - PINSRB(Imm(uint64(i)), b.As32(), tmp) - Comment("Add ncols / 8") - ADDQ(Imm(16), addr) - } + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) - Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256") MOVQ(cc, addr) ADDQ(Imm(7), addr) Comment("Multiple with nrows") - SHLQ(Imm(7), addr) + SHLQ(Imm(8), addr) ADDQ(rr, addr) SHRQ(Imm(3), addr) - Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { - PMOVMSKB(tmp, o) - MOVW(o.As16(), out.Idx(addr, 1)) - PSLLQ(Imm(1), tmp) + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) Comment("Sub nrows / 8") - SUBQ(Imm(16), addr) + SUBQ(Imm(32), addr) } - ADDQ(Imm(8), cc) Comment("Compare cc with ncols, here ncols=128") CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b0")) - ADDQ(Imm(16), rr) - Comment("Compare rr with nrows, here nrows=128") - CMPQ(rr, U8(128)) - JL(LabelRef("row_loop_b0")) + JL(LabelRef("col_loop")) + ADDQ(Imm(32), rr) + Comment("Compare rr with nrows, here nrows=256") + CMPQ(rr, U32(256)) + JL(LabelRef("row_loop")) + VZEROUPPER() RET() } -func transpose128RevAvx(flipMask Mem) { - // transpose128RevAvx function - TEXT("transpose128RevAvx", NOSPLIT, "func(in, out *byte)") - Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") +func transpose128x256avx2(flipMask Mem) { + // transpose128x256avx2 function + TEXT("transpose128x256avx2", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, 
128x256 => 256x128, just for test here.") in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} h, l := X1, X0 - tmp := Y0 t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + tmp := Y0 o := GP32() - - Comment("Initialize rr, current row, 96") - rr := zero() cc := GP64() - addr := GP64() - Label("row_loop_b3") + Comment("Initialize rr, current row") + rr := zero() + Label("row_loop") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b3") - Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(12288), addr) + Label("col_loop") + + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + addr := GP64() + MOVQ(rr, addr) + Comment("Multiple with ncols") + SHLQ(Imm(8), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) - Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -632,10 +589,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t1, l) VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -649,10 +610,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t2, l) VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -666,10 +631,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t3, l) VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -683,10 +652,14 @@ func transpose128RevAvx(flipMask Mem) { MOVOU(t4, l) VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") MOVQ(cc, addr) ADDQ(Imm(7), addr) - SHLQ(Imm(4), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 
7; i >= 0; i-- { @@ -698,29 +671,262 @@ func transpose128RevAvx(flipMask Mem) { } ADDQ(Imm(8), cc) - Comment("Compare cc with ncols, here ncols=128") - CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b3")) - + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop")) ADDQ(Imm(32), rr) + Comment("Compare rr with nrows, here nrows=128") + CMPQ(rr, U32(128)) + JL(LabelRef("row_loop")) + + VZEROUPPER() + RET() +} + +func transpose128Rev() { + // transpose128Rev function + TEXT("transpose128Rev", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") + + in := Mem{Base: Load(Param("in"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + tmp := XMM() + b := GP8() + o := GP32() + + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() + + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + ADDQ(Imm(96), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b3")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(32)) + JL(LabelRef("row_loop_b3")) + Label("row_loop_b2") Comment("Initialize cc, current col") XORQ(cc, cc) Label("col_loop_b2") Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(8192), addr) + MOVQ(rr, addr) + ADDQ(Imm(32), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b2")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(64)) + JL(LabelRef("row_loop_b2")) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + SUBQ(Imm(32), addr) + Comment("Multiple with ncols") + 
SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + Comment("Compare cc with ncols, here ncols=128") + ADDQ(Imm(8), cc) + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b1")) + + Comment("Compare rr with nrows, here nrows=128") + ADDQ(Imm(16), rr) + CMPQ(rr, U8(96)) + JL(LabelRef("row_loop_b1")) + + Label("row_loop_b0") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b0") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(rr, addr) + SUBQ(Imm(96), addr) + Comment("Multiple with ncols") + SHLQ(Imm(7), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct one XMM with first byte of first 16 rows") + for i := 0; i < 16; i++ { + MOVB(in.Idx(addr, 1), b) + PINSRB(Imm(uint64(i)), b.As32(), tmp) + Comment("Add ncols / 8") + ADDQ(Imm(16), addr) + } + + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + Comment("Multiple with nrows") + SHLQ(Imm(7), addr) + ADDQ(rr, addr) + SHRQ(Imm(3), addr) + + Comment("Get the most significant bit of each 8-bit element in the XMM, and store the returned 2 bytes") + for i := 7; i >= 0; i-- { + PMOVMSKB(tmp, o) + MOVW(o.As16(), out.Idx(addr, 1)) + PSLLQ(Imm(1), tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=128") + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b0")) + ADDQ(Imm(16), rr) + Comment("Compare rr with nrows, here nrows=128") + CMPQ(rr, U8(128)) + JL(LabelRef("row_loop_b0")) + + RET() +} + +func transpose128RevAvx(flipMask Mem) { + // transpose128RevAvx function + TEXT("transpose128RevAvx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x128") + + in := Mem{Base: Load(Param("in"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + h, l := X1, X0 + tmp := Y0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + o := GP32() + + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() + + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(U32(12288), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, 
o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -747,7 +953,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -765,7 +970,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -783,7 +987,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -801,7 +1004,6 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -815,28 +1017,27 @@ func transpose128RevAvx(flipMask Mem) { Comment("Compare cc with ncols, here ncols=128") CMPQ(cc, Imm(128)) - JL(LabelRef("col_loop_b2")) + JL(LabelRef("col_loop_b3")) ADDQ(Imm(32), rr) - - Label("row_loop_b1") + Label("row_loop_b2") Comment("Initialize cc, current col") XORQ(cc, cc) - Label("col_loop_b1") + Label("col_loop_b2") Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") - MOVQ(U32(4096), addr) + MOVQ(U32(8192), addr) ADDQ(cc, addr) SHRQ(Imm(3), addr) Comment("Construct eight XMM with first 4 bytes of the 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -863,7 +1064,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -881,7 +1082,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -899,7 +1100,7 @@ func transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -917,7 +1118,123 @@ func 
transpose128RevAvx(flipMask Mem) { MOVQ(cc, addr) ADDQ(Imm(7), addr) SHLQ(Imm(4), addr) - ADDQ(Imm(8), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=128") + CMPQ(cc, Imm(128)) + JL(LabelRef("col_loop_b2")) + + ADDQ(Imm(32), rr) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=128") + MOVQ(U32(4096), addr) + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") for i := 7; i >= 0; i-- { @@ -943,14 +1260,14 @@ func transpose128RevAvx(flipMask Mem) { SHRQ(Imm(3), addr) Comment("Construct eight XMM 
with first 4 bytes of first 32 rows") - getFirst4Bytes(flipMask, in, o, addr, t1) - getFirst4Bytes(flipMask, in, o, addr, t2) - getFirst4Bytes(flipMask, in, o, addr, t3) - getFirst4Bytes(flipMask, in, o, addr, t4) - getFirst4Bytes(flipMask, in, o, addr, t5) - getFirst4Bytes(flipMask, in, o, addr, t6) - getFirst4Bytes(flipMask, in, o, addr, t7) - getFirst4Bytes(flipMask, in, o, addr, t8) + getFirst4Bytes128(flipMask, in, o, addr, t1) + getFirst4Bytes128(flipMask, in, o, addr, t2) + getFirst4Bytes128(flipMask, in, o, addr, t3) + getFirst4Bytes128(flipMask, in, o, addr, t4) + getFirst4Bytes128(flipMask, in, o, addr, t5) + getFirst4Bytes128(flipMask, in, o, addr, t6) + getFirst4Bytes128(flipMask, in, o, addr, t7) + getFirst4Bytes128(flipMask, in, o, addr, t8) Comment("Matrix transform 4x4") VPUNPCKHDQ(t2, t1, h) @@ -1051,89 +1368,567 @@ func transpose128RevAvx(flipMask Mem) { RET() } -func xor32x128() { - // xor32x128 function - TEXT("xor32x128", NOSPLIT, "func(x, y, out *byte)") - Doc("out = x xor y") - x := Mem{Base: Load(Param("x"), GP64())} - y := Mem{Base: Load(Param("y"), GP64())} +func transpose256RevAvx(flipMask Mem) { + // transpose256RevAvx function + TEXT("transpose256RevAvx", NOSPLIT, "func(in, out *byte)") + Doc("Bit level matrix transpose, b0-b1-b2-b3, 128x256") + + in := Mem{Base: Load(Param("in"), GP64())} out := Mem{Base: Load(Param("out"), GP64())} - X := XMM() - Y := XMM() + h, l := X1, X0 + tmp := Y0 + t1, t2, t3, t4, t5, t6, t7, t8 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + o := GP32() - count := zero() - Label("xor32_loop") - MOVOU(x.Idx(count, 1), X) - MOVOU(y.Idx(count, 1), Y) - PXOR(X, Y) - MOVOU(Y, out.Idx(count, 1)) - ADDQ(U8(16), count) - CMPQ(count, U32(512)) - JL(LabelRef("xor32_loop")) + Comment("Initialize rr, current row, 96") + rr := zero() + cc := GP64() + addr := GP64() - RET() -} + Label("row_loop_b3") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b3") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(24576), addr) // 96 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) -func xor32x128avx() { - // xor32x128 function - TEXT("xor32x128avx", NOSPLIT, "func(x, y, out *byte)") - Doc("out = x xor y") - x := Mem{Base: Load(Param("x"), GP64())} - y := Mem{Base: Load(Param("y"), GP64())} - out := Mem{Base: Load(Param("out"), GP64())} + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) - X := YMM() - Y := YMM() + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) - count := zero() - Label("xor32_loop_avx") - VMOVDQU(x.Idx(count, 1), X) - VMOVDQU(y.Idx(count, 1), Y) - VPXOR(X, Y, Y) - VMOVDQU(Y, out.Idx(count, 1)) - ADDQ(U8(32), count) - CMPQ(count, U32(512)) - JL(LabelRef("xor32_loop_avx")) + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) - VZEROUPPER() - RET() -} + MOVOU(t1, l) + VINSERTI128(Imm(1), 
t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) -func xorRoundKey128() { - // xorRoundKey128 function - TEXT("xorRoundKey128", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") - Doc("xor x1, x2, x3 with round key, 16 bytes per bit") + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) - x := Load(Param("rk"), GP32()) - x1 := Mem{Base: Load(Param("x1"), GP64())} - x2 := Mem{Base: Load(Param("x2"), GP64())} - x3 := Mem{Base: Load(Param("x3"), GP64())} - out := Mem{Base: Load(Param("out"), GP64())} + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) - ret := XMM() - one := XMM() - PCMPEQB(one, one) + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) - y := GP32() + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) - count := GP64() - XORQ(count, count) - Comment("Handle first byte") - MOVL(U32(0x01000000), y) - Label("rk_loop_1") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_1_c")) - PXOR(one, ret) - Label("rk_loop_1_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(128)) - JL(LabelRef("rk_loop_1")) + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b3")) + + ADDQ(Imm(32), rr) + Label("row_loop_b2") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b2") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(16384), addr) // 64 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + 
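The eight-instruction unpack sequence used at every "Matrix transform 4x4" site in this file is the usual 4x4 transpose of 32-bit lanes spread across four XMM registers; its net effect, as a plain-Go sketch for reference only:

	// Net effect of the VPUNPCKL/HDQ + VPUNPCKL/HQDQ sequence (sketch):
	// on entry t[i] holds the four 32-bit lanes of register ti+1,
	// on return t[i] holds lane i of each of the four original registers.
	func transpose4x4Lanes(t *[4][4]uint32) {
		var r [4][4]uint32
		for i := 0; i < 4; i++ {
			for j := 0; j < 4; j++ {
				r[j][i] = t[i][j]
			}
		}
		*t = r
	}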
VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(4), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b2")) + + ADDQ(Imm(32), rr) + + Label("row_loop_b1") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b1") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(U32(8192), addr) // 32 * ncols + ADDQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of the 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + 
VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(8), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b1")) + + ADDQ(Imm(32), rr) + Label("row_loop_b0") + Comment("Initialize cc, current col") + XORQ(cc, cc) + Label("col_loop_b0") + Comment("Initialize (rr * ncols + cc) / 8, here ncols=256") + MOVQ(cc, addr) + SHRQ(Imm(3), addr) + + Comment("Construct eight XMM with first 4 bytes of first 32 rows") + getFirst4Bytes256(flipMask, in, o, addr, t1) + getFirst4Bytes256(flipMask, in, o, addr, t2) + getFirst4Bytes256(flipMask, in, o, addr, t3) + getFirst4Bytes256(flipMask, in, o, addr, t4) + getFirst4Bytes256(flipMask, in, o, addr, t5) + getFirst4Bytes256(flipMask, in, o, addr, t6) + getFirst4Bytes256(flipMask, in, o, addr, t7) + getFirst4Bytes256(flipMask, in, o, addr, t8) + + Comment("Matrix transform 4x4") + VPUNPCKHDQ(t2, t1, h) + VPUNPCKLDQ(t2, t1, t1) + VPUNPCKLDQ(t4, t3, l) + VPUNPCKHDQ(t4, t3, t3) + VPUNPCKHQDQ(l, t1, t2) + VPUNPCKLQDQ(l, t1, t1) + VPUNPCKHQDQ(t3, h, t4) + VPUNPCKLQDQ(t3, h, t3) + + VPUNPCKHDQ(t6, t5, h) + VPUNPCKLDQ(t6, t5, t5) + VPUNPCKLDQ(t8, t7, l) + VPUNPCKHDQ(t8, t7, t7) + VPUNPCKHQDQ(l, t5, t6) + VPUNPCKLQDQ(l, t5, t5) + VPUNPCKHQDQ(t7, h, t8) + VPUNPCKLQDQ(t7, h, t7) + + MOVOU(t1, l) + VINSERTI128(Imm(1), t5, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + 
VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t2, l) + VINSERTI128(Imm(1), t6, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t3, l) + VINSERTI128(Imm(1), t7, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + MOVOU(t4, l) + VINSERTI128(Imm(1), t8, tmp, tmp) + Comment("Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128") + MOVQ(cc, addr) + ADDQ(Imm(7), addr) + SHLQ(Imm(4), addr) + ADDQ(Imm(12), addr) + + Comment("Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes") + for i := 7; i >= 0; i-- { + VPMOVMSKB(tmp, o) + MOVL(o, out.Idx(addr, 1)) + VPSLLQ(Imm(1), tmp, tmp) + Comment("Sub nrows / 8") + SUBQ(Imm(16), addr) + } + ADDQ(Imm(8), cc) + + Comment("Compare cc with ncols, here ncols=256") + CMPQ(cc, U32(256)) + JL(LabelRef("col_loop_b0")) + + VZEROUPPER() + RET() +} + +func xor32x128() { + // xor32x128 function + TEXT("xor32x128", NOSPLIT, "func(x, y, out *byte)") + Doc("out = x xor y") + x := Mem{Base: Load(Param("x"), GP64())} + y := Mem{Base: Load(Param("y"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + X := XMM() + Y := XMM() + + count := zero() + Label("xor32_loop") + MOVOU(x.Idx(count, 1), X) + MOVOU(y.Idx(count, 1), Y) + PXOR(X, Y) + MOVOU(Y, out.Idx(count, 1)) + ADDQ(U8(16), count) + CMPQ(count, U32(512)) + JL(LabelRef("xor32_loop")) + + RET() +} + +func xor32x128avx() { + // xor32x128 function + TEXT("xor32x128avx", NOSPLIT, "func(len int, x, y, out *byte)") + Doc("out = x xor y") + x := Mem{Base: Load(Param("x"), GP64())} + y := Mem{Base: Load(Param("y"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + len := Load(Param("len"), GP64()) + + X := YMM() + Y := YMM() + + count := zero() + Label("xor32_loop_avx") + VMOVDQU(x.Idx(count, 1), X) + VMOVDQU(y.Idx(count, 1), Y) + VPXOR(X, Y, Y) + VMOVDQU(Y, out.Idx(count, 1)) + ADDQ(U8(32), count) + CMPQ(count, len) + JL(LabelRef("xor32_loop_avx")) + + VZEROUPPER() + RET() +} + +func xorRoundKey128() { + // xorRoundKey128 function + TEXT("xorRoundKey128", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") + Doc("xor x1, x2, x3 with round key, 16 bytes per bit") + + x := Load(Param("rk"), GP32()) + x1 := Mem{Base: Load(Param("x1"), GP64())} + x2 := Mem{Base: Load(Param("x2"), GP64())} + x3 := Mem{Base: Load(Param("x3"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} + + ret := XMM() + one := XMM() + PCMPEQB(one, one) + + y := GP32() + + count := GP64() + XORQ(count, count) + Comment("Handle first byte") + MOVL(U32(0x01000000), y) + Label("rk_loop_1") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + 
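The TESTL/JZ pair around this point implements the per-bit conditional complement: for each bit of the 32-bit round key, the corresponding 16-byte slice of x1^x2^x3 is XORed with the all-ones register when that bit is set. A plain-Go sketch of xorRoundKey128's semantics (illustrative only; the slice-to-bit mapping follows the 0x01000000 mask rolled left per slice, then 0x00010000, 0x00000100, 0x00000001 for the later bytes):

	// Reference sketch of xorRoundKey128 (not part of the patch):
	// x1, x2, x3 and out are 512-byte bit-sliced states, 16 bytes per key bit.
	func xorRoundKey128Ref(rk uint32, x1, x2, x3, out *[512]byte) {
		for i := 0; i < 32; i++ {
			// slice 0 follows rk bit 24 (0x01000000), slice 7 bit 31,
			// slice 8 bit 16, and so on, matching the rk_loop masks.
			bit := uint((3-i/8)*8 + i%8)
			var mask byte
			if rk&(uint32(1)<<bit) != 0 {
				mask = 0xff // complement this slice, like PXOR with the all-ones register
			}
			for j := 0; j < 16; j++ {
				out[i*16+j] = x1[i*16+j] ^ x2[i*16+j] ^ x3[i*16+j] ^ mask
			}
		}
	}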
JZ(LabelRef("rk_loop_1_c")) + PXOR(one, ret) + Label("rk_loop_1_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(128)) + JL(LabelRef("rk_loop_1")) Comment("Handle second byte") MOVL(U32(0x00010000), y) @@ -1142,868 +1937,1616 @@ func xorRoundKey128() { PXOR(x2.Idx(count, 1), ret) PXOR(x3.Idx(count, 1), ret) TESTL(x, y) - JZ(LabelRef("rk_loop_2_c")) + JZ(LabelRef("rk_loop_2_c")) + PXOR(one, ret) + Label("rk_loop_2_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(256)) + JL(LabelRef("rk_loop_2")) + + Comment("Handle third byte") + MOVL(U32(0x00000100), y) + Label("rk_loop_3") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_3_c")) + PXOR(one, ret) + Label("rk_loop_3_c") + MOVOU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(16), count) + CMPQ(count, U32(384)) + JL(LabelRef("rk_loop_3")) + + Comment("Handle last byte") + MOVL(U32(0x00000001), y) + Label("rk_loop_4") + MOVOU(x1.Idx(count, 1), ret) + PXOR(x2.Idx(count, 1), ret) + PXOR(x3.Idx(count, 1), ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_4_c")) PXOR(one, ret) - Label("rk_loop_2_c") + Label("rk_loop_4_c") MOVOU(ret, out.Idx(count, 1)) ROLL(U8(1), y) ADDQ(U8(16), count) - CMPQ(count, U32(256)) - JL(LabelRef("rk_loop_2")) + CMPQ(count, U32(512)) + JL(LabelRef("rk_loop_4")) + + RET() +} + +func sbox128() { + // sbox128 function + TEXT("sbox128", NOSPLIT, "func(x, buffer *byte)") + Doc("sbox128, 128 bits per 'byte'") + + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} + + Comment("f, for not operation") + f := XMM() + PCMPEQB(f, f) + + Comment("Start input function") + Comment("t1=b7 ^ b5") + t1 := XMM() + MOVOU(b.Offset(7*16), t1) + PXOR(b.Offset(5*16), t1) + + t2, t7, t8 := XMM(), XMM(), XMM() + MOVOU(b.Offset(1*16), t2) + MOVOU(t2, t7) + MOVOU(t2, t8) + Comment("store m6=b1") + MOVOU(t2, buffer.Offset((8+6)*16)) // m6 + Comment("t2=b5 ^ b1") + PXOR(b.Offset(5*16), t2) + PANDN(f, t2) + + t3, t4 := XMM(), XMM() + Comment("store g5=^b0") + MOVOU(b, t3) + MOVOU(t3, t4) + PANDN(f, t4) + MOVOU(t4, buffer.Offset(5*16)) // g5 + Comment("t3=^(b0 ^ t2)") + PXOR(t2, t3) + PANDN(f, t3) + + Comment("t4=b6 ^ b2") + t12 := XMM() + MOVOU(b.Offset(6*16), t4) + MOVOU(t4, t12) + PXOR(b.Offset(2*16), t4) + + Comment("t5=b3 ^ t3") + t5, t11 := XMM(), XMM() + MOVOU(b.Offset(3*16), t5) + MOVOU(t5, t11) + PXOR(t3, t5) + + Comment("t6=b4 ^ t1") + t6 := XMM() + MOVOU(b.Offset(4*16), t6) + PXOR(t1, t6) + + Comment("t7=b1 ^ t5") + PXOR(t5, t7) + Comment("t8=b1 ^ t4") + PXOR(t4, t8) + + Comment("t9=t6 ^ t8") + t9 := XMM() + MOVOU(t6, t9) + PXOR(t8, t9) + Comment("store m8") + MOVOU(t9, buffer.Offset((8+8)*16)) // m8 + Comment("store g1") + MOVOU(t7, buffer.Offset(1*16)) // g1 + Comment("store g3") + MOVOU(t5, buffer.Offset(3*16)) // g3 + Comment("store g4") + MOVOU(t2, buffer.Offset(4*16)) // g4 + Comment("store m0") + MOVOU(t6, buffer.Offset((8+0)*16)) // m0 + Comment("store m1") + MOVOU(t3, buffer.Offset((8+1)*16)) // m1 + Comment("store m2") + MOVOU(t8, buffer.Offset((8+2)*16)) // m2 + Comment("store m4") + MOVOU(t4, buffer.Offset((8+4)*16)) // m4 + + Comment("t11=^(b3 ^ t1)") + PXOR(t1, t11) + PANDN(f, t11) + Comment("store m5, can reuse t1 now") + MOVOU(t11, buffer.Offset((8+5)*16)) // m5 + + Comment("t12=^(b6 ^ t9)") + PXOR(t9, t12) + PANDN(f, t12) + Comment("store m9, can reuse t7 t8 t9 now") + MOVOU(t12, buffer.Offset((8+9)*16)) // m9 + + 
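In the rewritten sbox128, the register f is filled with all ones once via PCMPEQB, and every logical NOT afterwards is a single PANDN(f, x), which leaves ^x in x; the input function above then factors the S-box into shared terms g0..g7 and m0..m9 that are parked in the caller-provided buffer so later stages reuse them instead of recomputing. A lane-wise Go sketch of the first few terms, transcribed from the comments (one 64-bit lane stands in for a 128-bit slice):

	// Sketch of the start of the bit-sliced S-box input function; the real
	// code works on XMM slices and uses PANDN with f = all ones for the NOTs.
	func sboxInputSketch(b *[8]uint64) (t1, t2, t3 uint64) {
		t1 = b[7] ^ b[5]    // t1 = b7 ^ b5
		t2 = ^(b[5] ^ b[1]) // t2 = ^(b5 ^ b1): PXOR then PANDN with all ones
		t3 = ^(b[0] ^ t2)   // t3 = ^(b0 ^ t2)
		return
	}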
Comment("t10=t6 ^ t7") + t10 := t7 + PXOR(t6, t10) + Comment("store g0, can reuse t6 now") + MOVOU(t10, buffer) // g0 + + Comment("t13=t4 ^ t10") + t13 := t10 + PXOR(t4, t13) + Comment("store g2, can reuse t4 now") + MOVOU(t13, buffer.Offset(2*16)) // g2 + + Comment("t14=t2 ^ t11") + t14 := t1 + MOVOU(t11, t14) + PXOR(t2, t14) + Comment("store g6, can reuse t2 now") + MOVOU(t14, buffer.Offset(6*16)) // g6 + + Comment("t15=t12^t14") + t15 := t14 + PXOR(t12, t15) + Comment("store g7") + MOVOU(t15, buffer.Offset(7*16)) // g7 + + Comment("t16=t3 ^ t12") + t16 := t12 + PXOR(t3, t16) + Comment("store m3") + MOVOU(t16, buffer.Offset((8+3)*16)) // m3 + + Comment("t17=t11 ^ t16") + t17 := t16 + PXOR(t11, t17) + Comment("store m7") + MOVOU(t17, buffer.Offset((8+7)*16)) // m7 + + Comment("Start top function") + Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") + // t1 = g7 + // t2 = g4 + // t3 = m1 + // t4 = m4 + // t5 = g3 + // t6 = m0 + // t7 = g2 + // t8 = m2 + // t9 = m8 + // t11 = m5 + // t12 = m7 + Comment("t2=m0 & m1") + PAND(t6, t3) // t2 := t3 + + Comment("t3=g0 & g4") + PAND(buffer, t2) // t3 := t2 + + Comment("t4=g3 & g7") + MOVOU(t1, t6) + PAND(t5, t1) // t4 := t1 + + Comment("t7=g3 | g7") + POR(t6, t5) // t7 := t5 + + Comment("t11=m4 & m5") + PAND(t4, t11) // t11 + + MOVOU(buffer.Offset((8+3)*16), t4) // t4 = m3 + MOVOU(t4, t6) + Comment("t10=m3 & m2") + PAND(t8, t6) // t10 := t6 + Comment("t12=m3 | m2") + POR(t8, t4) // t12 := t4 + + Comment("t6=g6 | g2") + POR(buffer.Offset(6*16), t7) // t6 := t7 + + Comment("t9=m6 | m7") + POR(buffer.Offset((8+6)*16), t12) // t9 := t12 + + t10 = XMM() + MOVOU(buffer.Offset((8+9)*16), t8) // t8 = m9 + MOVOU(t8, t10) + + Comment("t5=m8 & m9") + PAND(t9, t8) // t5 := t8 + Comment("t8=m8 | m9") + POR(t9, t10) // t8 := t10 + + Comment("t14 = t3 ^ t2") + PXOR(t3, t2) // t14 = t3 ^ t2 + Comment("t16 = t5 ^ t14") + PXOR(t2, t8) // t16 = t5 ^ t14, can reuse t2 now + Comment("t20 = t16 ^ t7") + PXOR(t8, t5) // t20 = t16 ^ t7 + Comment("t17 = t9 ^ t10") + PXOR(t12, t6) // t17 = t9 ^ t10 + Comment("t18 = t11 ^ t12") + PXOR(t11, t4) // t18 = t11 ^ t12 + Comment("p2 = t20 ^ t18") + PXOR(t5, t4) // p2 = t20 ^ t18, can reuse t5 now + Comment("p0 = t6 ^ t16") + PXOR(t7, t8) // p0 = t6 ^ t16 + Comment("t1 = g5 & g1") + MOVOU(buffer.Offset(1*16), t2) + MOVOU(buffer.Offset(5*16), t5) + PAND(t2, t5) // t1 := t5 + Comment("t13 = t1 ^ t2") + PXOR(t5, t3) // t13 = t1 ^ t2 + Comment("t15 = t13 ^ t4") + PXOR(t1, t3) // t15 = t4 ^ t13 + Comment("t19 = t6 ^ t15") + PXOR(t3, t7) // t19 = t6 ^ t15 + Comment("p3 = t19 ^ t17") + PXOR(t6, t7) // p3 = t19 ^ t17 + Comment("p1 = t8 ^ t15") + PXOR(t10, t3) // p1 = t8 ^ t15 + + Comment("start middle function") + Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") + + // t3 = p1 + // t4 = p2 + // t7 = p3 + // t8 = p0 + Comment("t0 = p1 & p2") + MOVOU(t3, t1) + PAND(t4, t1) // t0 := t1 + + Comment("t1 = p3 & p0") + MOVOU(t8, t2) + PAND(t7, t2) // t1 := t2 + + Comment("t2 = p0 & p2") + MOVOU(t4, t5) + PAND(t8, t5) // t2 := t5 + + Comment("t3 = p1 & p3") + MOVOU(t3, t6) + PAND(t7, t6) // t3 := t6 + + Comment("t4 = t0 & t2") + MOVOU(t1, t9) + PAND(t5, t9) // t4 := t9 + + Comment("t5 = t1 & t3") + MOVOU(t2, t10) + PXOR(t6, t10) // t5 := t10 + + Comment("t6 = t5 | p0") + POR(t10, t8) // t6 := t8 + + Comment("t7 = t2 | p3") + POR(t5, t7) // t7 + + Comment("t8 = t4 ^ t6") + PXOR(t9, t8) // l3 = t8 + + Comment("t9 = t7 ^ t3") + PXOR(t7, t6) // t9 := t6 
+ + Comment("t10 = t0 ^ t9") + PXOR(t1, t6) // l0 = t10 := t6 + + Comment("t11 = p2 | t5") + POR(t10, t4) // t11 := t4 + Comment("l1 = t11 ^ t1") + PXOR(t4, t2) // l1 := t2 + + Comment("t12 = p1 | t2") + POR(t5, t3) // t12 := t3 + Comment("l2 = t12 ^ t5") + PXOR(t10, t3) // l2 := t3 + + Comment("start bottom function") + Comment("current register status: t6=l0, t2=l1, t3=l2, t8=l3") + Comment("k4 = l2 ^ l3") + MOVOU(t8, t5) + PXOR(t3, t5) // k4 := t5 + + Comment("k3 = l1 ^ l3") + MOVOU(t8, t4) + PXOR(t2, t4) // k3 := t4 + + Comment("k2 = l0 ^ l2") + MOVOU(t6, t7) + PXOR(t3, t7) // k2 := t7 + + Comment("k0 = l0 ^ l1") + MOVOU(t6, t1) + PXOR(t2, t1) // k0 := t1 + + Comment("k1 = k2 ^ k3") + MOVOU(t4, t9) + PXOR(t7, t9) // k1 := t9 + + Comment("e0=(m1 & k0)") + MOVOU(buffer.Offset((8+1)*16), t10) // m1 + PAND(t1, t10) // e0 := t10 + + Comment("e1=(g5 & l1)") + MOVOU(buffer.Offset(5*16), t11) + PAND(t2, t11) // e1 := t11 + + Comment("r0=e0 ^ e1") + PXOR(t11, t10) // r0 = e0 ^ e1 + + Comment("e2=(g4 & l0)") + MOVOU(buffer.Offset(4*16), t12) + PAND(t6, t12) + + Comment("r1=e2 ^ e1") + PXOR(t12, t11) // r1 = e2 ^ e1 + + Comment("store r0 r1") + MOVOU(t10, buffer.Offset(22*16)) // in fact, we can start from 18*16 + MOVOU(t11, buffer.Offset(23*16)) + + Comment("e3=(m7 & k3)") + MOVOU(buffer.Offset((8+7)*16), t10) // m7 + PAND(t4, t10) + + Comment("e4=(m5 & k2)") + MOVOU(buffer.Offset((8+5)*16), t11) // m5 + PAND(t7, t11) + Comment("r2=e3 ^ e4") + PXOR(t11, t10) // r2 = e3 ^ e4 + + Comment("e5=(m3 & k1)") + MOVOU(buffer.Offset((8+3)*16), t12) // m3 + PAND(t9, t12) + Comment("r3=e5 ^ e4") + PXOR(t12, t11) // r3 = e5 ^ e4 + + Comment("store r2 r3") + MOVOU(t10, buffer.Offset(24*16)) + MOVOU(t11, buffer.Offset(25*16)) + + Comment("e6=(m9 & k4)") + MOVOU(buffer.Offset((8+9)*16), t10) // m9 + PAND(t5, t10) + + Comment("e7=(g7 & l3)") + MOVOU(buffer.Offset(7*16), t11) + PAND(t8, t11) + Comment("r4=e7 ^ e6") + PXOR(t11, t10) // r4 = e6 ^ e7 + + Comment("e8=(g6 & l2)") + MOVOU(buffer.Offset(6*16), t12) + PAND(t3, t12) + Comment("r5=e8 ^ e6") + PXOR(t11, t12) // r5 = e8 ^ e7 + + Comment("store r4") + MOVOU(t10, buffer.Offset(26*16)) + + Comment("e9=(m0 & k0)") + MOVOU(buffer.Offset((8+0)*16), t10) // m0 + PAND(t1, t10) // e9 := t10 + + Comment("e10=(g1 & l1)") + MOVOU(buffer.Offset(1*16), t1) + PAND(t2, t1) // e10 := t1 + + Comment("r6=e9 ^ e10") + PXOR(t1, t10) // r6 = e9 ^ e10 + + Comment("e11=(g0 & l0)") + MOVOU(buffer, t11) + PAND(t11, t6) // e11 := t6 + Comment("r7=e11 ^ e10") + PXOR(t6, t1) // r7 = e11 ^ e10 = t1 - Comment("Handle third byte") - MOVL(U32(0x00000100), y) - Label("rk_loop_3") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_3_c")) - PXOR(one, ret) - Label("rk_loop_3_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(384)) - JL(LabelRef("rk_loop_3")) + Comment("e12=(m6 & k3)") + MOVOU(buffer.Offset((8+6)*16), t2) // m6 + PAND(t4, t2) - Comment("Handle last byte") - MOVL(U32(0x00000001), y) - Label("rk_loop_4") - MOVOU(x1.Idx(count, 1), ret) - PXOR(x2.Idx(count, 1), ret) - PXOR(x3.Idx(count, 1), ret) - TESTL(x, y) - JZ(LabelRef("rk_loop_4_c")) - PXOR(one, ret) - Label("rk_loop_4_c") - MOVOU(ret, out.Idx(count, 1)) - ROLL(U8(1), y) - ADDQ(U8(16), count) - CMPQ(count, U32(512)) - JL(LabelRef("rk_loop_4")) + Comment("e13=(m4 & k2)") + MOVOU(buffer.Offset((8+4)*16), t6) // m4 + PAND(t7, t6) + + Comment("r8=e12 ^ e13") + PXOR(t6, t2) // r8 = e12 ^ e13 = t2 + + 
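The bottom function derives its five k masks from the four l values with five XORs, forming k1 from k2 ^ k3 instead of recomputing it from l0..l3; this sharing of intermediate terms is the "reduce logic operations" idea in the commit title. A lane-wise Go sketch of those formulas, taken directly from the comments above:

	// Bottom-function masks, one 64-bit lane per slice (sketch of the
	// commented formulas; the generated code keeps these in XMM registers).
	func sboxBottomSketch(l0, l1, l2, l3 uint64) (k0, k1, k2, k3, k4 uint64) {
		k4 = l2 ^ l3
		k3 = l1 ^ l3
		k2 = l0 ^ l2
		k0 = l0 ^ l1
		k1 = k2 ^ k3 // shared term: equals l0^l1^l2^l3 with no extra work
		return
	}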
Comment("e14=(m2 & k1)") + MOVOU(buffer.Offset((8+2)*16), t4) // m2 + PAND(t9, t4) + Comment("r9=e14 ^ e13") + PXOR(t6, t4) // r9 = e14 ^ e13 = t4 + + Comment("e15=(m8 & k4)") + MOVOU(buffer.Offset((8+8)*16), t9) // m8 + PAND(t9, t5) + + Comment("e16=(g3 & l3)") + MOVOU(buffer.Offset(3*16), t9) + PAND(t9, t8) + Comment("r10=e15 ^ e16") + PXOR(t8, t5) // r10 = e15 ^ e16 = t5 + + Comment("e17=(g2 & l2)") + MOVOU(buffer.Offset(2*16), t11) + PAND(t11, t3) + Comment("r11=e17 ^ e16") + PXOR(t8, t3) // r11 = e17 ^ e16 = t3 + + Comment("start output function") + // t12 = r5 + // t10 = r6 + // t1 = r7 + // t2 = r8 + // t4 = r9 + // t5 = r10 + // t3 = r11 + Comment("[t1]=r7 ^ r9") + PXOR(t1, t4) // [t1] = t4 = r7 ^ r9 + + Comment("[t2]=t1 ^ r1") + MOVOU(buffer.Offset((22+1)*16), t6) // r1 + PXOR(t4, t6) // [t2] = t6 = r1 ^ [t1] + + Comment("[t3]=t2 ^ r3") + MOVOU(buffer.Offset((22+3)*16), t7) // r3 + MOVOU(t7, t8) + PXOR(t6, t7) // [t3] = t7 = r3 ^ [t2] + Comment("[t4]=r5 ^ r3") + PXOR(t12, t8) // [t4] = t8 = r5 ^ r3 + + Comment("[t5]=r4 ^ [t4]") + MOVOU(buffer.Offset((22+4)*16), t9) // r4 + MOVOU(t9, t11) + PXOR(t8, t9) // [t5] = t9 = r4 ^ t4 + Comment("[t6]=r0 ^ r4") + PXOR(buffer.Offset(22*16), t11) // [t6] = t11 = r4 ^ r0 + + Comment("[t7]=r11 ^ r7") + PXOR(t3, t1) // [t7] t1 = r7 ^ r11 + + Comment("[t8]=[t1] ^ [t4]") + PXOR(t4, t8) // t8 = t4 ^ t11 + Comment("store t8") + MOVOU(t8, b.Offset(5*16)) + + Comment("[t9]=[t1] ^ [t6]") + PXOR(t11, t4) // [t9] = t4 + Comment("store t9") + MOVOU(t4, b.Offset(2*16)) + + Comment("[t10]=r2 ^ t5") + PXOR(buffer.Offset((22+2)*16), t9) // [t10] t9 = r2 ^ [t5] + Comment("[t11]=r10 ^ r8") + PXOR(t5, t2) // [t11] = t2 + Comment("store t11") + MOVOU(t2, b.Offset(3*16)) + Comment("[t12]=^([t3] ^ [t11])") + PXOR(t7, t2) + PANDN(f, t2) // [t12] = t2 + Comment("store t12") + MOVOU(t2, b.Offset(1*16)) + Comment("[t13]=[t10] ^ [t12]") + PXOR(t2, t9) // [t13] = t9 + Comment("store t13") + MOVOU(t9, b.Offset(6*16)) + + Comment("[t14]=^([t3] ^ [t7])") + PXOR(t7, t1) + PANDN(f, t1) // [t14] + Comment("store t14") + MOVOU(t1, b.Offset(4*16)) + Comment("[t16]=[t6] ^ [t14]") + PXOR(t11, t1) // [t16] + Comment("store t16") + MOVOU(t1, b) + + Comment("[t15]=^(r10 ^ r6)") + PXOR(t10, t5) + PANDN(f, t5) + Comment("store t15") + MOVOU(t5, b.Offset(7*16)) RET() } -func sbox128() { - // sbox128 function - TEXT("sbox128", NOSPLIT, "func(x, buffer *byte)") - Doc("sbox128, 128 bits per 'byte'") +// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 +// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 +// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 +// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 +// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 +func l128() { + // l128 function + TEXT("l128", NOSPLIT, "func(x, buffer *byte)") + Doc("l128, 128 bits per 'byte'") b := Mem{Base: Load(Param("x"), GP64())} buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("f, for not operation") - f := XMM() - PCMPEQB(f, f) + X0, X1, X2, X3, X4, X5, X6, X7, X8, X9 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() - Comment("Start input function") - Comment("t1=b7 ^ b5") - t1 := XMM() - MOVOU(b.Offset(7*16), t1) - PXOR(b.Offset(5*16), t1) + MOVOU(b, X0) + MOVOU(b.Offset(8*16), X1) + MOVOU(b.Offset(16*16), X2) + MOVOU(b.Offset(24*16), X3) + 
MOVOU(b.Offset(18*16), X5) + MOVOU(b.Offset(22*16), X6) + MOVOU(b.Offset(26*16), X7) + MOVOU(b.Offset(30*16), X8) + MOVOU(b.Offset(2*16), X9) - t2, t7, t8 := XMM(), XMM(), XMM() - MOVOU(b.Offset(1*16), t2) - MOVOU(t2, t7) - MOVOU(t2, t8) - Comment("store m6=b1") - MOVOU(t2, buffer.Offset((8+6)*16)) // m6 - Comment("t2=b5 ^ b1") - PXOR(b.Offset(5*16), t2) - PANDN(f, t2) + Comment("0=0^24^14^22^30") + MOVOU(X0, X4) + PXOR(X3, X4) + PXOR(b.Offset(14*16), X4) + PXOR(X6, X4) + PXOR(X8, X4) + MOVOU(X4, buffer) - t3, t4 := XMM(), XMM() - Comment("store g5=^b0") - MOVOU(b, t3) - MOVOU(t3, t4) - PANDN(f, t4) - MOVOU(t4, buffer.Offset(5*16)) // g5 - Comment("t3=^(b0 ^ t2)") - PXOR(t2, t3) - PANDN(f, t3) + Comment("2=0^2^26^8^16") + MOVOU(X0, X4) + PXOR(X9, X4) + PXOR(X7, X4) + PXOR(X1, X4) + PXOR(X2, X4) + MOVOU(X4, buffer.Offset(2*16)) - Comment("t4=b6 ^ b2") - t12 := XMM() - MOVOU(b.Offset(6*16), t4) - MOVOU(t4, t12) - PXOR(b.Offset(2*16), t4) + Comment("8=0^8^22^30^6") + MOVOU(X0, X4) + PXOR(X1, X4) + PXOR(X6, X4) + PXOR(X8, X4) + PXOR(b.Offset(6*16), X4) + MOVOU(X4, buffer.Offset(8*16)) + + Comment("18=0^18^10^16^24") + MOVOU(X0, X4) + PXOR(X5, X4) + PXOR(b.Offset(10*16), X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(18*16)) + + Comment("26=0^26^18^24^8") + PXOR(X1, X0) + PXOR(X7, X0) + PXOR(X5, X0) + PXOR(X3, X0) + MOVOU(X0, buffer.Offset(26*16)) + + Comment("10=10^2^8^16^24") + MOVOU(X9, X4) + PXOR(b.Offset(10*16), X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(10*16)) + + MOVOU(b.Offset(6*16), X0) + MOVOU(b.Offset(14*16), X5) + Comment("16=16^8^30^6^14") + PXOR(X2, X1) + PXOR(X8, X1) + PXOR(X0, X1) + PXOR(X5, X1) + MOVOU(X1, buffer.Offset(16*16)) + + Comment("24=24^16^6^14^22") + PXOR(X3, X2) + PXOR(X0, X2) + PXOR(X5, X2) + PXOR(X6, X2) + MOVOU(X2, buffer.Offset(24*16)) + + MOVOU(b.Offset(4*16), X1) + MOVOU(b.Offset(10*16), X2) + MOVOU(b.Offset(12*16), X3) + // X0=6, X1=4, X9=X4=2, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + Comment("4=4^28^2^10^18") + MOVOU(X9, X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(b.Offset(18*16), X4) + PXOR(b.Offset(28*16), X4) + MOVOU(X4, buffer.Offset(4*16)) + + Comment("20=20^12^18^26^2") + MOVOU(X9, X4) + PXOR(b.Offset(20*16), X4) + PXOR(X3, X4) + PXOR(b.Offset(18*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(20*16)) + + Comment("28=28^20^26^2^10") + PXOR(b.Offset(28*16), X9) + PXOR(b.Offset(20*16), X9) + PXOR(X7, X9) + PXOR(X2, X9) + MOVOU(X9, buffer.Offset(28*16)) + + MOVOU(b.Offset(20*16), X9) + // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + + Comment("6=6^30^4^12^20") + MOVOU(X1, X4) + PXOR(X0, X4) + PXOR(X3, X4) + PXOR(X8, X4) + PXOR(X9, X4) + MOVOU(X4, buffer.Offset(6*16)) + + Comment("12=12^4^10^18^26") + MOVOU(X1, X4) + PXOR(X3, X4) + PXOR(X2, X4) + PXOR(b.Offset(18*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(12*16)) + + MOVOU(b.Offset(28*16), X7) + // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=28, X8=30 + Comment("22=22^14^20^28^4") + MOVOU(X1, X4) + PXOR(X5, X4) + PXOR(X6, X4) + PXOR(X9, X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(22*16)) + + Comment("30=30^22^28^4^12") + PXOR(X8, X1) + PXOR(X6, X1) + PXOR(X3, X1) + PXOR(X7, X1) + MOVOU(X1, buffer.Offset(30*16)) + + Comment("14=14^6^12^20^28") + PXOR(X3, X0) + PXOR(X7, X0) + PXOR(X9, X0) + PXOR(X5, X0) + MOVOU(X0, buffer.Offset(14*16)) - Comment("t5=b3 ^ t3") - t5, t11 := XMM(), XMM() - MOVOU(b.Offset(3*16), t5) - MOVOU(t5, t11) - PXOR(t3, t5) + MOVOU(b.Offset(1*16), X0) + MOVOU(b.Offset(9*16), X1) + MOVOU(b.Offset(17*16), X2) + 
MOVOU(b.Offset(25*16), X3) + MOVOU(b.Offset(19*16), X5) + MOVOU(b.Offset(23*16), X6) + MOVOU(b.Offset(27*16), X7) + MOVOU(b.Offset(31*16), X8) + MOVOU(b.Offset(3*16), X9) - Comment("t6=b4 ^ t1") - t6 := XMM() - MOVOU(b.Offset(4*16), t6) - PXOR(t1, t6) + Comment("1=1^25^15^23^31") + MOVOU(X0, X4) + PXOR(X3, X4) + PXOR(b.Offset(15*16), X4) + PXOR(X6, X4) + PXOR(X8, X4) + MOVOU(X4, buffer.Offset(1*16)) - Comment("t7=b1 ^ t5") - PXOR(t5, t7) - Comment("t8=b1 ^ t4") - PXOR(t4, t8) + Comment("3=3^27^1^9^17") + MOVOU(X0, X4) + PXOR(X9, X4) + PXOR(X7, X4) + PXOR(X1, X4) + PXOR(X2, X4) + MOVOU(X4, buffer.Offset(3*16)) - Comment("t9=t6 ^ t8") - t9 := XMM() - MOVOU(t6, t9) - PXOR(t8, t9) - Comment("store m8") - MOVOU(t9, buffer.Offset((8+8)*16)) // m8 - Comment("store g1") - MOVOU(t7, buffer.Offset(1*16)) // g1 - Comment("store g3") - MOVOU(t5, buffer.Offset(3*16)) // g3 - Comment("store g4") - MOVOU(t2, buffer.Offset(4*16)) // g4 - Comment("store m0") - MOVOU(t6, buffer.Offset((8+0)*16)) // m0 - Comment("store m1") - MOVOU(t3, buffer.Offset((8+1)*16)) // m1 - Comment("store m2") - MOVOU(t8, buffer.Offset((8+2)*16)) // m2 - Comment("store m4") - MOVOU(t4, buffer.Offset((8+4)*16)) // m4 + Comment("9=9^1^23^31^7") + MOVOU(X0, X4) + PXOR(X1, X4) + PXOR(X6, X4) + PXOR(X8, X4) + PXOR(b.Offset(7*16), X4) + MOVOU(X4, buffer.Offset(9*16)) - Comment("t11=^(b3 ^ t1)") - PXOR(t1, t11) - PANDN(f, t11) - Comment("store m5, can reuse t1 now") - MOVOU(t11, buffer.Offset((8+5)*16)) // m5 + Comment("19=1^19^11^17^25") + MOVOU(X0, X4) + PXOR(X5, X4) + PXOR(b.Offset(11*16), X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(19*16)) - Comment("t12=^(b6 ^ t9)") - PXOR(t9, t12) - PANDN(f, t12) - Comment("store m9, can reuse t7 t8 t9 now") - MOVOU(t12, buffer.Offset((8+9)*16)) // m9 + Comment("27=1^27^19^25^9") + PXOR(X1, X0) + PXOR(X7, X0) + PXOR(X5, X0) + PXOR(X3, X0) + MOVOU(X0, buffer.Offset(27*16)) - Comment("t10=t6 ^ t7") - t10 := t7 - PXOR(t6, t10) - Comment("store g0, can reuse t6 now") - MOVOU(t10, buffer) // g0 + Comment("11=11^3^9^17^25") + MOVOU(X9, X4) + PXOR(b.Offset(11*16), X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(X3, X4) + MOVOU(X4, buffer.Offset(11*16)) - Comment("t13=t4 ^ t10") - t13 := t10 - PXOR(t4, t13) - Comment("store g2, can reuse t4 now") - MOVOU(t13, buffer.Offset(2*16)) // g2 + MOVOU(b.Offset(7*16), X0) + MOVOU(b.Offset(15*16), X5) + Comment("17=17^9^31^7^15") + PXOR(X2, X1) + PXOR(X8, X1) + PXOR(X0, X1) + PXOR(X5, X1) + MOVOU(X1, buffer.Offset(17*16)) - Comment("t14=t2 ^ t11") - t14 := t1 - MOVOU(t11, t14) - PXOR(t2, t14) - Comment("store g6, can reuse t2 now") - MOVOU(t14, buffer.Offset(6*16)) // g6 + Comment("25=25^17^7^15^23") + PXOR(X3, X2) + PXOR(X0, X2) + PXOR(X5, X2) + PXOR(X6, X2) + MOVOU(X2, buffer.Offset(25*16)) - Comment("t15=t12^t14") - t15 := t14 - PXOR(t12, t15) - Comment("store g7") - MOVOU(t15, buffer.Offset(7*16)) // g7 + MOVOU(b.Offset(5*16), X1) + MOVOU(b.Offset(11*16), X2) + MOVOU(b.Offset(13*16), X3) + // X0=7, X1=5, X9=X4=3, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 + Comment("5=5^29^3^11^19") + MOVOU(X9, X4) + PXOR(X1, X4) + PXOR(X2, X4) + PXOR(b.Offset(19*16), X4) + PXOR(b.Offset(29*16), X4) + MOVOU(X4, buffer.Offset(5*16)) - Comment("t16=t3 ^ t12") - t16 := t12 - PXOR(t3, t16) - Comment("store m3") - MOVOU(t16, buffer.Offset((8+3)*16)) // m3 + Comment("21=21^13^19^27^3") + MOVOU(X9, X4) + PXOR(b.Offset(21*16), X4) + PXOR(X3, X4) + PXOR(b.Offset(19*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(21*16)) - Comment("t17=t11 ^ t16") - t17 := t16 - PXOR(t11, t17) - 
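The per-bit XOR lists in the comments of l128 and l256 (for example 0=0^24^14^22^30 and 1=1^25^15^23^31 above), together with the five index rows before each function, are the bit-sliced expansion of the SM4 linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24), spelled out in this implementation's bit-slice ordering. A scalar reference, matching the expectation the updated TestL128 later in this patch builds with math/bits:

import "math/bits"

// lRef is the word-level SM4 linear transform; l128/l256 compute the same
// function with one 128-/256-bit register per bit position.
func lRef(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

Each output slot is the XOR of exactly five input slots, which is why every group above is one load, four PXOR/VPXOR operations and a store.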
Comment("store m7") - MOVOU(t17, buffer.Offset((8+7)*16)) // m7 + Comment("29=29^21^27^3^11") + PXOR(b.Offset(29*16), X9) + PXOR(b.Offset(21*16), X9) + PXOR(X7, X9) + PXOR(X2, X9) + MOVOU(X9, buffer.Offset(29*16)) - Comment("Start top function") - Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") - // t1 = g7 - // t2 = g4 - // t3 = m1 - // t4 = m4 - // t5 = g3 - // t6 = m0 - // t7 = g2 - // t8 = m2 - // t9 = m8 - // t11 = m5 - // t12 = m7 - Comment("t2=^(m0 & m1)") - PAND(t6, t3) - PANDN(f, t3) // t2 + MOVOU(b.Offset(21*16), X9) + // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 - Comment("t3=^(g0 & g4)") - PAND(buffer, t2) - PANDN(f, t2) // t3 + Comment("7=7^31^5^13^21") + MOVOU(X1, X4) + PXOR(X0, X4) + PXOR(X3, X4) + PXOR(X8, X4) + PXOR(X9, X4) + MOVOU(X4, buffer.Offset(7*16)) - Comment("t4=^(g3 & g7)") - MOVOU(t1, t6) - PAND(t5, t1) - PANDN(f, t1) // t4 + Comment("13=13^5^11^19^27") + MOVOU(X1, X4) + PXOR(X3, X4) + PXOR(X2, X4) + PXOR(b.Offset(19*16), X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(13*16)) - Comment("t7=^(g3 | g7)") - POR(t6, t5) - PANDN(f, t5) // t7 + MOVOU(b.Offset(29*16), X7) + // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=29, X8=31 + Comment("23=23^15^21^29^5") + MOVOU(X1, X4) + PXOR(X5, X4) + PXOR(X6, X4) + PXOR(X9, X4) + PXOR(X7, X4) + MOVOU(X4, buffer.Offset(23*16)) - Comment("t11=^(m4 & m5)") - PAND(t4, t11) - PANDN(f, t11) // t11 + Comment("31=31^23^29^5^13") + PXOR(X8, X1) + PXOR(X6, X1) + PXOR(X3, X1) + PXOR(X7, X1) + MOVOU(X1, buffer.Offset(31*16)) - MOVOU(buffer.Offset((8+3)*16), t4) // t4 = m3 - MOVOU(t4, t6) - Comment("t10=^( m3 & m2 )") - PAND(t8, t6) - PANDN(f, t6) // t10 - Comment("t12=^( m3 | m2 )") - POR(t8, t4) - PANDN(f, t4) // t12 + Comment("15=15^7^13^21^29") + PXOR(X3, X0) + PXOR(X7, X0) + PXOR(X9, X0) + PXOR(X5, X0) + MOVOU(X0, buffer.Offset(15*16)) - Comment("t6=^( g6 | g2 )") - POR(buffer.Offset(6*16), t7) - PANDN(f, t7) // t6 + RET() +} - Comment("t9=^( m6 | m7 )") - POR(buffer.Offset((8+6)*16), t12) - PANDN(f, t12) // t9 +// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 +// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 +// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 +// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 +// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 +func l256() { + // l256 function + TEXT("l256", NOSPLIT, "func(x, buffer *byte)") + Doc("l256, 256 bits per 'byte'") - t10 = XMM() - MOVOU(buffer.Offset((8+9)*16), t8) // t8 = m9 - MOVOU(t8, t10) + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("t5=^( m8 & m9 )") - PAND(t9, t8) - PANDN(f, t8) // t5 - Comment("t8=^( m8 | m9 )") - POR(t9, t10) - PANDN(f, t10) // t8 + y0, y1, y2, y3, y4, y5, y6, y7, y8, y9 := YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM() - Comment("t14 = t3 ^ t2") - PXOR(t3, t2) // t14 = t3 ^ t2 - Comment("t16 = t5 ^ t14") - PXOR(t2, t8) // t16 = t5 ^ t14, can reuse t2 now - Comment("t20 = t16 ^ t7") - PXOR(t8, t5) // t20 = t16 ^ t7 - Comment("t17 = t9 ^ t10") - PXOR(t12, t6) // t17 = t9 ^ t10 - Comment("t18 = t11 ^ t12") - PXOR(t11, t4) // t18 = t11 ^ t12 - Comment("p2 = t20 ^ t18") - PXOR(t5, t4) // p2 = t20 ^ t18, can reuse t5 now - Comment("p0 = t6 ^ 
t16") - PXOR(t7, t8) // p0 = t6 ^ t16 - Comment("t1 = ^(g5 & g1)") - MOVOU(buffer.Offset(1*16), t2) - MOVOU(buffer.Offset(5*16), t5) - PAND(t2, t5) - PANDN(f, t5) // t1 - Comment("t13 = t1 ^ t2") - PXOR(t5, t3) // t13 = t1 ^ t2 - Comment("t15 = t13 ^ t4") - PXOR(t1, t3) // t15 = t4 ^ t13 - Comment("t19 = t6 ^ t15") - PXOR(t3, t7) // t19 = t6 ^ t15 - Comment("p3 = t19 ^ t17") - PXOR(t6, t7) // p3 = t19 ^ t17 - Comment("p1 = t8 ^ t15") - PXOR(t10, t3) // p1 = t8 ^ t15 + VMOVDQU(b, y0) + VMOVDQU(b.Offset(8*32), y1) + VMOVDQU(b.Offset(16*32), y2) + VMOVDQU(b.Offset(24*32), y3) + VMOVDQU(b.Offset(18*32), y5) + VMOVDQU(b.Offset(22*32), y6) + VMOVDQU(b.Offset(26*32), y7) + VMOVDQU(b.Offset(30*32), y8) + VMOVDQU(b.Offset(2*32), y9) + + Comment("0=0^24^14^22^30") + VPXOR(y3, y0, y4) + VPXOR(b.Offset(14*32), y4, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VMOVDQU(y4, buffer) - Comment("start middle function") - Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") + Comment("2=0^2^26^8^16") + VPXOR(y9, y0, y4) + VPXOR(y7, y4, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VMOVDQU(y4, buffer.Offset(2*32)) - // t3 = p1 - // t4 = p2 - // t7 = p3 - // t8 = p0 - Comment("t1 = ^(p3 & p0)") - MOVOU(t8, t1) - PAND(t7, t1) - PANDN(f, t1) // t1 = ^(p3 & p0) - - Comment("t2 = ^(t1 | p2)") - MOVOU(t4, t2) - POR(t1, t2) - PANDN(f, t2) // t2 = ^(t1 | p2) - - Comment("t3 = ^(p2 & p0)") - MOVOU(t4, t5) // p2 - PAND(t8, t4) - PANDN(f, t4) // t4 = ^(p2 & p0) - - Comment("t4 = p1 ^ t3") - PXOR(t3, t4) // t4 = p1 ^ t4 - - Comment("t5 = ^(p2 | t4)") - MOVOU(t5, t9) // p2 - POR(t4, t5) - PANDN(f, t5) // t5 = ^(p2 | t4) - - Comment("t6 = ^(p1 & t4)") - MOVOU(t3, t6) // p1 - PAND(t4, t6) - PANDN(f, t6) // t6 = ^(p1 & t4) - - Comment("t7 = ^(p3 | t4)") - MOVOU(t7, t11) // p3 - POR(t4, t7) - PANDN(f, t7) // t7 = ^(p3 | t4) - - Comment("t8 = ^(t7 | t2)") - MOVOU(t8, t12) // p0 - MOVOU(t7, t8) - POR(t2, t8) - PANDN(f, t8) // t8 = ^(t7 | t2) + Comment("8=0^8^22^30^6") + VPXOR(y1, y0, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(b.Offset(6*32), y4, y4) + VMOVDQU(y4, buffer.Offset(8*32)) - Comment("t9 = ^(t7 ^ t5)") - PXOR(t5, t7) - PANDN(f, t7) // t7 = ^(t5 ^ t7) + Comment("18=0^18^10^16^24") + VPXOR(y5, y0, y4) + VPXOR(b.Offset(10*32), y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(18*32)) - Comment("t10 = ^(t9 ^ p3)") - PXOR(t7, t11) - PANDN(f, t11) // l0 = t11 = ^(t7 & p3) + Comment("26=0^26^18^24^8") + VPXOR(y1, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y5, y0, y0) + VPXOR(y3, y0, y0) + VMOVDQU(y0, buffer.Offset(26*32)) - Comment("t11 = ^(t6 & t8)") - PAND(t8, t6) - PANDN(f, t6) // l2 = t6 = ^(t6 & t8) + Comment("10=10^2^8^16^24") + VPXOR(b.Offset(10*32), y9, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(10*32)) + + VMOVDQU(b.Offset(6*32), y0) + VMOVDQU(b.Offset(14*32), y5) + Comment("16=16^8^30^6^14") + VPXOR(y2, y1, y1) + VPXOR(y8, y1, y1) + VPXOR(y0, y1, y1) + VPXOR(y5, y1, y1) + VMOVDQU(y1, buffer.Offset(16*32)) - Comment("t12 = ^(p1 & t8)") - PAND(t8, t3) - PANDN(f, t3) // t3 = ^(t8 & p1) + Comment("24=24^16^6^14^22") + VPXOR(y3, y2, y2) + VPXOR(y0, y2, y2) + VPXOR(y5, y2, y2) + VPXOR(y6, y2, y2) + VMOVDQU(y2, buffer.Offset(24*32)) + + VMOVDQU(b.Offset(4*32), y1) + VMOVDQU(b.Offset(10*32), y2) + VMOVDQU(b.Offset(12*32), y3) + // y0=6, y1=4, y9=y4=2, y2=10, y3=12, y5=14, y6=22, y7=26, y8=30 + Comment("4=4^28^2^10^18") + VPXOR(y1, y9, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(b.Offset(28*32), y4, y4) + VMOVDQU(y4, 
buffer.Offset(4*32)) - Comment("t13 = ^(t12 ^ p0)") - PXOR(t3, t12) - PANDN(f, t12) // l3 = t12 = ^(p0 ^ t3) + Comment("20=20^12^18^26^2") + VPXOR(b.Offset(20*32), y9, y4) + VPXOR(y3, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(20*32)) - Comment("t14 = ^(t1 & p2)") - PAND(t1, t9) - PANDN(f, t9) // t14 + Comment("28=28^20^26^2^10") + VPXOR(b.Offset(28*32), y9, y9) + VPXOR(b.Offset(20*32), y9, y9) + VPXOR(y7, y9, y9) + VPXOR(y2, y9, y9) + VMOVDQU(y9, buffer.Offset(28*32)) - Comment("t15 = ^(t14 & t9)") - PAND(t9, t7) - PANDN(f, t7) // l1 + VMOVDQU(b.Offset(20*32), y9) + // y0=6, y1=4, y9=20, y2=10, y3=12, y5=14, y6=22, y7=26, y8=30 - Comment("start bottom function") - Comment("current register status: t11=l0, t7=l1, t6=l2, t12=l3") - Comment("k4 = l2 ^ l3") - MOVOU(t12, t5) - PXOR(t6, t5) // k4 = l2 ^ l3 - Comment("k3 = l1 ^ l3") - MOVOU(t12, t4) - PXOR(t7, t4) // k3 = l1 ^ l3 - Comment("k2 = l0 ^ l2") - MOVOU(t6, t3) - PXOR(t11, t3) // k2 = l0 ^ l2 - Comment("k0 = l0 ^ l1") - MOVOU(t7, t1) - PXOR(t11, t1) // k0 = l0 ^ l1 - Comment("k1 = k2 ^ k3") - MOVOU(t4, t2) - PXOR(t3, t2) // k1 = k2 ^ k3 + Comment("6=6^30^4^12^20") + VPXOR(y0, y1, y4) + VPXOR(y3, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(y9, y4, y4) + VMOVDQU(y4, buffer.Offset(6*32)) - Comment("e0=^(m1 & k0)") - MOVOU(buffer.Offset((8+1)*16), t8) // m1 - PAND(t1, t8) - PANDN(f, t8) // e0 + Comment("12=12^4^10^18^26") + VPXOR(y3, y1, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(18*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(12*32)) + + VMOVDQU(b.Offset(28*32), y7) + // y0=6, y1=4, y9=20, y2=10, y3=12, y5=14, y6=22, y7=28, y8=30 + Comment("22=22^14^20^28^4") + VPXOR(y5, y1, y4) + VPXOR(y6, y4, y4) + VPXOR(y9, y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(22*32)) - Comment("e1=^(g5 & l1)") - MOVOU(buffer.Offset(5*16), t9) - PAND(t7, t9) - PANDN(f, t9) // e1 + Comment("30=30^22^28^4^12") + VPXOR(y8, y1, y1) + VPXOR(y6, y1, y1) + VPXOR(y3, y1, y1) + VPXOR(y7, y1, y1) + VMOVDQU(y1, buffer.Offset(30*32)) - Comment("r0=e0 ^ e1") - PXOR(t9, t8) // r0 = e0 ^ e1 + Comment("14=14^6^12^20^28") + VPXOR(y3, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y9, y0, y0) + VPXOR(y5, y0, y0) + VMOVDQU(y0, buffer.Offset(14*32)) + + VMOVDQU(b.Offset(1*32), y0) + VMOVDQU(b.Offset(9*32), y1) + VMOVDQU(b.Offset(17*32), y2) + VMOVDQU(b.Offset(25*32), y3) + VMOVDQU(b.Offset(19*32), y5) + VMOVDQU(b.Offset(23*32), y6) + VMOVDQU(b.Offset(27*32), y7) + VMOVDQU(b.Offset(31*32), y8) + VMOVDQU(b.Offset(3*32), y9) - Comment("e2=^(g4 & l0)") - MOVOU(buffer.Offset(4*16), t10) - PAND(t11, t10) - PANDN(f, t10) // e2 - Comment("r1=e2 ^ e1") - PXOR(t10, t9) // r1 = e2 ^ e1 + Comment("1=1^25^15^23^31") + VPXOR(y3, y0, y4) + VPXOR(b.Offset(15*32), y4, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VMOVDQU(y4, buffer.Offset(1*32)) - Comment("store r0 r1") - MOVOU(t8, buffer.Offset(22*16)) // in fact, we can start from 18*16 - MOVOU(t9, buffer.Offset(23*16)) - - Comment("e3=^(m7 & k3)") - MOVOU(buffer.Offset((8+7)*16), t8) // m7 - PAND(t4, t8) - PANDN(f, t8) // e3 - - Comment("e4=^(m5 & k2)") - MOVOU(buffer.Offset((8+5)*16), t9) // m5 - PAND(t3, t9) - PANDN(f, t9) // e4 - Comment("r2=e3 ^ e4") - PXOR(t9, t8) // r2 = e3 ^ e4 + Comment("3=3^27^1^9^17") + VPXOR(y9, y0, y4) + VPXOR(y7, y4, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VMOVDQU(y4, buffer.Offset(3*32)) - Comment("e5=^(m3 & k1)") - MOVOU(buffer.Offset((8+3)*16), t10) // m3 - PAND(t2, t10) - PANDN(f, t10) // e5 - Comment("r3=e5 ^ e4") - PXOR(t10, t9) // r3 = e5 ^ e4 + 
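l256 is the same XOR network as l128 widened from XMM to YMM registers (32-byte slots instead of 16), and the VEX three-operand forms also remove the register copies the SSE2 code needs: PXOR overwrites its destination, so l128 has to MOVOU into a scratch register first, while VPXOR writes to a separate destination. A side-by-side fragment in the avo notation used above (not a standalone function):

MOVOU(X0, X4)     // XMM path: copy first, because PXOR is destructive
PXOR(X3, X4)      //   X4 = X0 ^ X3
VPXOR(y3, y0, y4) // YMM path: non-destructive three-operand form, no copy needed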
Comment("9=9^1^23^31^7") + VPXOR(y1, y0, y4) + VPXOR(y6, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(b.Offset(7*32), y4, y4) + VMOVDQU(y4, buffer.Offset(9*32)) - Comment("store r2 r3") - MOVOU(t8, buffer.Offset(24*16)) - MOVOU(t9, buffer.Offset(25*16)) - - Comment("e6=^(m9 & k4)") - MOVOU(buffer.Offset((8+9)*16), t8) // m9 - PAND(t5, t8) - PANDN(f, t8) // e6 - - Comment("e7=^(g7 & l3)") - MOVOU(buffer.Offset(7*16), t9) - PAND(t12, t9) - PANDN(f, t9) // e7 - Comment("r4=e7 ^ e6") - PXOR(t9, t8) // r4 = e6 ^ e7 + Comment("19=1^19^11^17^25") + VPXOR(y5, y0, y4) + VPXOR(b.Offset(11*32), y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(19*32)) - Comment("e8=^(g6 & l2)") - MOVOU(buffer.Offset(6*16), t10) - PAND(t6, t10) - PANDN(f, t10) // e8 - Comment("r5=e8 ^ e6") - PXOR(t10, t9) // r5 = e8 ^ e7 + Comment("27=1^27^19^25^9") + VPXOR(y1, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y5, y0, y0) + VPXOR(y3, y0, y0) + VMOVDQU(y0, buffer.Offset(27*32)) - Comment("store r4 r5") - MOVOU(t8, buffer.Offset(26*16)) - MOVOU(t9, buffer.Offset(27*16)) + Comment("11=11^3^9^17^25") + VPXOR(b.Offset(11*32), y9, y4) + VPXOR(y1, y4, y4) + VPXOR(y2, y4, y4) + VPXOR(y3, y4, y4) + VMOVDQU(y4, buffer.Offset(11*32)) + + VMOVDQU(b.Offset(7*32), y0) + VMOVDQU(b.Offset(15*32), y5) + Comment("17=17^9^31^7^15") + VPXOR(y2, y1, y1) + VPXOR(y8, y1, y1) + VPXOR(y0, y1, y1) + VPXOR(y5, y1, y1) + VMOVDQU(y1, buffer.Offset(17*32)) - Comment("e9=^(m0 & k0)") - MOVOU(buffer.Offset((8+0)*16), t8) // m0 - PAND(t1, t8) - PANDN(f, t8) // e9 + Comment("25=25^17^7^15^23") + VPXOR(y3, y2, y2) + VPXOR(y0, y2, y2) + VPXOR(y5, y2, y2) + VPXOR(y6, y2, y2) + VMOVDQU(y2, buffer.Offset(25*32)) + + VMOVDQU(b.Offset(5*32), y1) + VMOVDQU(b.Offset(11*32), y2) + VMOVDQU(b.Offset(13*32), y3) + // y0=7, y1=5, y9=y4=3, y2=11, y3=13, y5=15, y6=23, y7=27, y8=31 + Comment("5=5^29^3^11^19") + VPXOR(y1, y9, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(b.Offset(29*32), y4, y4) + VMOVDQU(y4, buffer.Offset(5*32)) - Comment("e10=^(g1 & l1)") - MOVOU(buffer.Offset(1*16), t1) - PAND(t7, t1) - PANDN(f, t1) // e10 + Comment("21=21^13^19^27^3") + VPXOR(b.Offset(21*32), y9, y4) + VPXOR(y3, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(21*32)) - Comment("r6=e9 ^ e10") - PXOR(t1, t8) // r6 = e9 ^ e10 + Comment("29=29^21^27^3^11") + VPXOR(b.Offset(29*32), y9, y9) + VPXOR(b.Offset(21*32), y9, y9) + VPXOR(y7, y9, y9) + VPXOR(y2, y9, y9) + VMOVDQU(y9, buffer.Offset(29*32)) - Comment("e11=^(g0 & l0)") - MOVOU(buffer, t10) - PAND(t11, t10) - PANDN(f, t10) // e11 - Comment("r7=e11 ^ e10") - PXOR(t10, t1) // r7 = e11 ^ e10 - Comment("store r6") - MOVOU(t8, buffer.Offset(28*16)) + VMOVDQU(b.Offset(21*32), y9) + // y0=7, y1=5, y9=21, y2=11, y3=13, y5=15, y6=23, y7=27, y8=31 - Comment("e12=^(m6 & k3)") - MOVOU(buffer.Offset((8+6)*16), t7) // m6 - PAND(t4, t7) - PANDN(f, t7) // e12 + Comment("7=7^31^5^13^21") + VPXOR(y0, y1, y4) + VPXOR(y3, y4, y4) + VPXOR(y8, y4, y4) + VPXOR(y9, y4, y4) + VMOVDQU(y4, buffer.Offset(7*32)) - Comment("e13=^(m4 & k2)") - MOVOU(buffer.Offset((8+4)*16), t11) // m4 - PAND(t3, t11) - PANDN(f, t11) // e13 + Comment("13=13^5^11^19^27") + VPXOR(y3, y1, y4) + VPXOR(y2, y4, y4) + VPXOR(b.Offset(19*32), y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, buffer.Offset(13*32)) + + VMOVDQU(b.Offset(29*32), y7) + // y0=7, y1=5, y9=21, y2=11, y3=13, y5=15, y6=23, y7=29, y8=31 + Comment("23=23^15^21^29^5") + VPXOR(y5, y1, y4) + VPXOR(y6, y4, y4) + VPXOR(y9, y4, y4) + VPXOR(y7, y4, y4) + VMOVDQU(y4, 
buffer.Offset(23*32)) - Comment("r8=e12 ^ e13") - PXOR(t11, t7) // r8 = e12 ^ e13 = t7 + Comment("31=31^23^29^5^13") + VPXOR(y8, y1, y1) + VPXOR(y6, y1, y1) + VPXOR(y3, y1, y1) + VPXOR(y7, y1, y1) + VMOVDQU(y1, buffer.Offset(31*32)) - Comment("e14=^(m2 & k1)") - MOVOU(buffer.Offset((8+2)*16), t10) // m2 - PAND(t2, t10) - PANDN(f, t10) // e14 + Comment("15=15^7^13^21^29") + VPXOR(y3, y0, y0) + VPXOR(y7, y0, y0) + VPXOR(y9, y0, y0) + VPXOR(y5, y0, y0) + VMOVDQU(y0, buffer.Offset(15*32)) - Comment("r9=e14 ^ e13") - PXOR(t10, t11) // r9 = e14 ^ e13 = t11 + VZEROUPPER() + RET() +} - Comment("e15=^(m8 & k4)") - MOVOU(buffer.Offset((8+8)*16), t8) // m8 - PAND(t5, t8) - PANDN(f, t8) // e15 +func sbox256avx2() { + // sbox256avx2 function + TEXT("sbox256avx2", NOSPLIT, "func(x, buffer *byte)") + Doc("sbox256avx2, 256 bits per 'byte'") - Comment("e16=^(g3 & l3)") - MOVOU(buffer.Offset(3*16), t9) - PAND(t12, t9) - PANDN(f, t9) // e16 - Comment("r10=e15 ^ e16") - PXOR(t9, t8) // r10 = e15 ^ e16 = t8 + b := Mem{Base: Load(Param("x"), GP64())} + buffer := Mem{Base: Load(Param("buffer"), GP64())} - Comment("e17=^(g2 & l2)") - MOVOU(buffer.Offset(2*16), t10) - PAND(t6, t10) - PANDN(f, t10) // e17 + Comment("f, for not operation") + f := YMM() + VPCMPEQB(f, f, f) - Comment("r11=e17 ^ e16") - PXOR(t10, t9) // r11 = e17 ^ e16 = t9 + Comment("Start input function") + Comment("t1=b7 ^ b5") + t1 := YMM() + VMOVDQU(b.Offset(7*32), t1) + VPXOR(b.Offset(5*32), t1, t1) - Comment("start output function") - // t1 = r7 - // t7 = r8 - // t11 = r9 - // t8 = r10 - // t9 = r11 - Comment("[t1]=r7 ^ r9") - PXOR(t1, t11) // t11 = r7 ^ r9 - Comment("t2=t1 ^ r1") - MOVOU(buffer.Offset((22+1)*16), t2) // r1 - PXOR(t11, t2) // t2 = r1 ^ t11 - Comment("t3=t2 ^ r3") - MOVOU(buffer.Offset((22+3)*16), t3) // r3 - MOVOU(t3, t4) - PXOR(t2, t3) // t3 = r3 ^ t2 - Comment("t4=r5 ^ r3") - PXOR(buffer.Offset((22+5)*16), t4) // t4 = r5 ^ r3 + t2, t7, t8 := YMM(), YMM(), YMM() + VMOVDQU(b.Offset(1*32), t8) + Comment("store m6=b1") + VMOVDQU(t8, buffer.Offset((8+6)*32)) // m6 + Comment("t2=b5 ^ b1") + VPXOR(b.Offset(5*32), t8, t2) + VPANDN(f, t2, t2) - MOVOU(buffer.Offset((22+4)*16), t5) // r4 - MOVOU(t5, t6) - Comment("t5=r4 ^ t4") - PXOR(t4, t5) // t5 = r4 ^ t4 - Comment("t6=r0 ^ t4") - PXOR(buffer.Offset(22*16), t6) // t6 = r4 ^ r0 + t3, t4 := YMM(), YMM() - Comment("[t7]=r11 ^ r7") - PXOR(t9, t1) // [t7] t1 = r7 ^ r11 - Comment("[t8]=[t1] ^ t4") - PXOR(t11, t4) // [t8] t4 = t4 ^ t11 - Comment("store t8") - MOVOU(t4, b.Offset(5*16)) - Comment("[t9]=[t1] ^ t6") - PXOR(t6, t11) // [t9] t11 = t11 ^ t6 - Comment("store t9") - MOVOU(t11, b.Offset(2*16)) + VMOVDQU(b, t4) + Comment("t3=^(b0 ^ t2)") + VPXOR(t2, t4, t3) + VPANDN(f, t3, t3) + Comment("store g5=^b0") + VPANDN(f, t4, t4) + VMOVDQU(t4, buffer.Offset(5*32)) // g5 - Comment("[t10]=r2 ^ t5") - PXOR(buffer.Offset((22+2)*16), t5) // [t10] t5 = r2 ^ t5 - Comment("[t11]=r10 ^ r8") - PXOR(t8, t7) // [t11] t7 = rr8 ^ r10 - Comment("store t11") - MOVOU(t7, b.Offset(3*16)) - Comment("[t12]=^(t3 ^ [t11])") - PXOR(t3, t7) // t7 = t3 ^ [t11] - PANDN(f, t7) // [t12] t7 = ^(t3 ^ [t11]) - Comment("store t12") - MOVOU(t7, b.Offset(1*16)) - Comment("[t13]=[t10] ^ [t12]") - PXOR(t7, t5) // [t13] t5 = [t10] ^ [t12] - Comment("store t13") - MOVOU(t5, b.Offset(6*16)) + Comment("t4=b6 ^ b2") + t12 := YMM() + VMOVDQU(b.Offset(6*32), t12) + VPXOR(b.Offset(2*32), t12, t4) - Comment("[t14]=^(t3 ^ [t7])") - PXOR(t3, t1) - PANDN(f, t1) // [t14] - Comment("store t14") - MOVOU(t1, b.Offset(4*16)) - Comment("[t16]=t6 ^ 
[t14]") - PXOR(t6, t1) // [t16] - Comment("store t16") - MOVOU(t1, b) + Comment("t5=b3 ^ t3") + t5, t11 := YMM(), YMM() + VMOVDQU(b.Offset(3*32), t11) + VPXOR(t3, t11, t5) - Comment("[t15]=^(r10 ^ r6)") - PXOR(buffer.Offset((22+6)*16), t8) - PANDN(f, t8) - Comment("store t15") - MOVOU(t8, b.Offset(7*16)) + Comment("t6=b4 ^ t1") + t6 := YMM() + VPXOR(b.Offset(4*32), t1, t6) - RET() -} + Comment("t7=b1 ^ t5") + VPXOR(t5, t8, t7) + Comment("t8=b1 ^ t4") + VPXOR(t4, t8, t8) -// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 -// 24 25 26 27 28 29 30 31 | 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 -// 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 -// 22 23 8 9 10 11 12 13 | 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 -// 30 31 16 17 18 19 20 21 | 6 7 24 25 26 27 28 29 | 14 15 0 1 2 3 4 5 | 22 23 8 9 10 11 12 13 -func l128() { - // l128 function - TEXT("l128", NOSPLIT, "func(x, buffer *byte)") - Doc("l128, 128 bits per 'byte'") + Comment("t9=t6 ^ t8") + t9 := YMM() + VPXOR(t8, t6, t9) + Comment("store m8") + VMOVDQU(t9, buffer.Offset((8+8)*32)) // m8 + Comment("store g1") + VMOVDQU(t7, buffer.Offset(1*32)) // g1 + Comment("store g3") + VMOVDQU(t5, buffer.Offset(3*32)) // g3 + Comment("store g4") + VMOVDQU(t2, buffer.Offset(4*32)) // g4 + Comment("store m0") + VMOVDQU(t6, buffer.Offset((8+0)*32)) // m0 + Comment("store m1") + VMOVDQU(t3, buffer.Offset((8+1)*32)) // m1 + Comment("store m2") + VMOVDQU(t8, buffer.Offset((8+2)*32)) // m2 + Comment("store m4") + VMOVDQU(t4, buffer.Offset((8+4)*32)) // m4 - b := Mem{Base: Load(Param("x"), GP64())} - buffer := Mem{Base: Load(Param("buffer"), GP64())} + Comment("t11=^(b3 ^ t1)") + VPXOR(t1, t11, t11) + VPANDN(f, t11, t11) + Comment("store m5, can reuse t1 now") + VMOVDQU(t11, buffer.Offset((8+5)*32)) // m5 - X0, X1, X2, X3, X4, X5, X6, X7, X8, X9 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + Comment("t12=^(b6 ^ t9)") + VPXOR(t9, t12, t12) + VPANDN(f, t12, t12) + Comment("store m9, can reuse t7 t8 t9 now") + VMOVDQU(t12, buffer.Offset((8+9)*32)) // m9 - MOVOU(b, X0) - MOVOU(b.Offset(8*16), X1) - MOVOU(b.Offset(16*16), X2) - MOVOU(b.Offset(24*16), X3) - MOVOU(b.Offset(18*16), X5) - MOVOU(b.Offset(22*16), X6) - MOVOU(b.Offset(26*16), X7) - MOVOU(b.Offset(30*16), X8) - MOVOU(b.Offset(2*16), X9) + Comment("t10=t6 ^ t7") + t10 := t7 + VPXOR(t6, t10, t10) + Comment("store g0, can reuse t6 now") + VMOVDQU(t10, buffer) // g0 - Comment("0=0^24^14^22^30") - MOVOU(X0, X4) - PXOR(X3, X4) - PXOR(b.Offset(14*16), X4) - PXOR(X6, X4) - PXOR(X8, X4) - MOVOU(X4, buffer) + Comment("t13=t4 ^ t10") + t13 := t10 + VPXOR(t4, t13, t13) + Comment("store g2, can reuse t4 now") + VMOVDQU(t13, buffer.Offset(2*32)) // g2 - Comment("2=0^2^26^8^16") - MOVOU(X0, X4) - PXOR(X9, X4) - PXOR(X7, X4) - PXOR(X1, X4) - PXOR(X2, X4) - MOVOU(X4, buffer.Offset(2*16)) + Comment("t14=t2 ^ t11") + t14 := t1 + VPXOR(t2, t11, t14) + Comment("store g6, can reuse t2 now") + VMOVDQU(t14, buffer.Offset(6*32)) // g6 - Comment("8=0^8^22^30^6") - MOVOU(X0, X4) - PXOR(X1, X4) - PXOR(X6, X4) - PXOR(X8, X4) - PXOR(b.Offset(6*16), X4) - MOVOU(X4, buffer.Offset(8*16)) + Comment("t15=t12^t14") + t15 := t14 + VPXOR(t12, t15, t15) + Comment("store g7") + VMOVDQU(t15, buffer.Offset(7*32)) // g7 - Comment("18=0^18^10^16^24") - MOVOU(X0, X4) - PXOR(X5, X4) - PXOR(b.Offset(10*16), X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(18*16)) + 
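The scratch buffer used by the S-box routines is addressed as fixed-width slots, 16 bytes per slot in the 128-bit code and 32 bytes per slot here: slots 0–7 hold g0–g7, slots 8–17 hold m0–m9, and slots from 22 upward are reused for the r values (a comment later in the function notes they could start at 18, since slots 18–21 are never written). The code above and below touches slots up to 26, so the caller has to provide at least 27 slots. A small helper, hypothetical and only for illustration, since the generated assembly simply hard-codes the offsets:

// slot returns slot i of the S-box scratch buffer; width is 16 for the
// sbox128 layout and 32 for sbox256avx2.
func slot(buffer []byte, i, width int) []byte {
	return buffer[i*width : (i+1)*width]
}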
Comment("t16=t3 ^ t12") + t16 := t12 + VPXOR(t3, t16, t16) + Comment("store m3") + VMOVDQU(t16, buffer.Offset((8+3)*32)) // m3 - Comment("26=0^26^18^24^8") - PXOR(X1, X0) - PXOR(X7, X0) - PXOR(X5, X0) - PXOR(X3, X0) - MOVOU(X0, buffer.Offset(26*16)) + Comment("t17=t11 ^ t16") + t17 := t16 + VPXOR(t11, t17, t17) + Comment("store m7") + VMOVDQU(t17, buffer.Offset((8+7)*32)) // m7 - Comment("10=10^2^8^16^24") - MOVOU(X9, X4) - PXOR(b.Offset(10*16), X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(10*16)) + Comment("Start top function") + Comment("Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8") + // t1 = g7 + // t2 = g4 + // t3 = m1 + // t4 = m4 + // t5 = g3 + // t6 = m0 + // t7 = g2 + // t8 = m2 + // t9 = m8 + // t11 = m5 + // t12 = m7 + Comment("t2= (m0 & m1)") + VPAND(t6, t3, t3) // t2 - MOVOU(b.Offset(6*16), X0) - MOVOU(b.Offset(14*16), X5) - Comment("16=16^8^30^6^14") - PXOR(X2, X1) - PXOR(X8, X1) - PXOR(X0, X1) - PXOR(X5, X1) - MOVOU(X1, buffer.Offset(16*16)) + Comment("t3= (g0 & g4)") + VPAND(buffer, t2, t2) // t3 - Comment("24=24^16^6^14^22") - PXOR(X3, X2) - PXOR(X0, X2) - PXOR(X5, X2) - PXOR(X6, X2) - MOVOU(X2, buffer.Offset(24*16)) + Comment("t4= (g3 & g7)") + VPAND(t5, t1, t6) // [t4] := t6 - MOVOU(b.Offset(4*16), X1) - MOVOU(b.Offset(10*16), X2) - MOVOU(b.Offset(12*16), X3) - // X0=6, X1=4, X9=X4=2, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 - Comment("4=4^28^2^10^18") - MOVOU(X9, X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(b.Offset(18*16), X4) - PXOR(b.Offset(28*16), X4) - MOVOU(X4, buffer.Offset(4*16)) + Comment("t7= (g3 | g7)") + VPOR(t1, t5, t5) // [t7] := t5 - Comment("20=20^12^18^26^2") - MOVOU(X9, X4) - PXOR(b.Offset(20*16), X4) - PXOR(X3, X4) - PXOR(b.Offset(18*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(20*16)) + Comment("t11= (m4 & m5)") + VPAND(t4, t11, t11) - Comment("28=28^20^26^2^10") - PXOR(b.Offset(28*16), X9) - PXOR(b.Offset(20*16), X9) - PXOR(X7, X9) - PXOR(X2, X9) - MOVOU(X9, buffer.Offset(28*16)) + VMOVDQU(buffer.Offset((8+3)*32), t4) // t4 = m3 + Comment("t10= ( m3 & m2 )") + VPAND(t8, t4, t1) // [t10] := t1 + Comment("t12= ( m3 | m2 )") + VPOR(t8, t4, t4) //[t12] := t4 - MOVOU(b.Offset(20*16), X9) - // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=26, X8=30 + Comment("t6= ( g6 | g2 )") + VPOR(buffer.Offset(6*32), t7, t7) // [t6] := t7 - Comment("6=6^30^4^12^20") - MOVOU(X1, X4) - PXOR(X0, X4) - PXOR(X3, X4) - PXOR(X8, X4) - PXOR(X9, X4) - MOVOU(X4, buffer.Offset(6*16)) + Comment("t9= ( m6 | m7 )") + VPOR(buffer.Offset((8+6)*32), t12, t12) // [t9] := t12 + + t10 = YMM() + VMOVDQU(buffer.Offset((8+9)*32), t10) // t10 = m9 + + Comment("t5= ( m8 & m9 )") + VPAND(t9, t10, t8) // [t5] := t8 + Comment("t8= ( m8 | m9 )") + VPOR(t9, t10, t10) // [t8] := t10 + + Comment("t14 = t3 ^ t2") + VPXOR(t3, t2, t2) // t14 = t3 ^ t2 + Comment("t16 = t5 ^ t14") + VPXOR(t2, t8, t8) // t16 = t5 ^ t14, can reuse t2 now + Comment("t20 = t16 ^ t7") + VPXOR(t8, t5, t5) // t20 = t16 ^ t7 + Comment("t17 = t9 ^ t10") + VPXOR(t12, t1, t1) // t17 = t9 ^ t10 + Comment("t18 = t11 ^ t12") + VPXOR(t11, t4, t4) // t18 = t11 ^ t12 + Comment("p2 = t20 ^ t18") + VPXOR(t5, t4, t4) // p2 = t20 ^ t18, can reuse t5 now + Comment("p0 = t6 ^ t16") + VPXOR(t7, t8, t8) // p0 = t6 ^ t16 + Comment("t1 = (g5 & g1)") + VMOVDQU(buffer.Offset(1*32), t2) + VMOVDQU(buffer.Offset(5*32), t5) + VPAND(t2, t5, t5) + Comment("t13 = t1 ^ t2") + VPXOR(t5, t3, t3) // t13 = t1 ^ t2 + Comment("t15 = t13 ^ t4") + 
VPXOR(t6, t3, t3) // t15 = t4 ^ t13 + Comment("t19 = t6 ^ t15") + VPXOR(t3, t7, t7) // t19 = t6 ^ t15 + Comment("p3 = t19 ^ t17") + VPXOR(t1, t7, t7) // p3 = t19 ^ t17 + Comment("p1 = t8 ^ t15") + VPXOR(t10, t3, t3) // p1 = t8 ^ t15 - Comment("12=12^4^10^18^26") - MOVOU(X1, X4) - PXOR(X3, X4) - PXOR(X2, X4) - PXOR(b.Offset(18*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(12*16)) + Comment("start middle function") + Comment("current register status: t8=p0, t3=p1, t4=p2, t7=p0") - MOVOU(b.Offset(28*16), X7) - // X0=6, X1=4, X9=20, X2=10, X3=12, X5=14, X6=22, X7=28, X8=30 - Comment("22=22^14^20^28^4") - MOVOU(X1, X4) - PXOR(X5, X4) - PXOR(X6, X4) - PXOR(X9, X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(22*16)) + // t3 = p1 + // t4 = p2 + // t7 = p3 + // t8 = p0 + Comment("t0 = (p1 & p2)") + VPAND(t3, t4, t1) + Comment("t1 = (p3 & p0)") + VPAND(t7, t8, t2) + Comment("t2 = (p0 & p2)") + VPAND(t4, t8, t5) + Comment("t3 = (p1 & p3)") + VPAND(t3, t7, t6) + Comment("t4 = (t0 & t2)") + VPAND(t1, t5, t9) + Comment("t5 = (t1 ^ t3)") + VPXOR(t2, t6, t10) + Comment("t6 = (t5 | p0)") + VPOR(t10, t8, t8) + Comment("t7 = (t2 | p3)") + VPOR(t5, t7, t7) + Comment("t8 = (t4 ^ t6)") + VPXOR(t9, t8, t8) // l3 + Comment("t9 = (t7 ^ t3)") + VPXOR(t6, t7, t9) + Comment("t10 = (t0 ^ t9)") + VPXOR(t1, t9, t1) // l0 + Comment("t11 = (t5 | p2)") + VPOR(t4, t10, t4) + Comment("l1 = t11 ^ t1") + VPXOR(t2, t4, t2) // l1 + Comment("t12 = (t2 | p1)") + VPOR(t5, t3, t3) + Comment("l2 = t12 ^ t5") + VPXOR(t3, t10, t3) // l2 - Comment("30=30^22^28^4^12") - PXOR(X8, X1) - PXOR(X6, X1) - PXOR(X3, X1) - PXOR(X7, X1) - MOVOU(X1, buffer.Offset(30*16)) + Comment("start bottom function") + Comment("current register status: t1=l0, t2=l1, t3=l2, t8=l3") + Comment("k4 = l2 ^ l3") + VPXOR(t3, t8, t7) // k4 = l2 ^ l3 + Comment("k3 = l1 ^ l3") + VPXOR(t8, t2, t6) // k3 = l1 ^ l3 + Comment("k2 = l0 ^ l2") + VPXOR(t1, t3, t5) // k2 = l0 ^ l2 + Comment("k0 = l0 ^ l1") + VPXOR(t1, t2, t4) // k0 = l0 ^ l1 + Comment("k1 = k2 ^ k3") + VPXOR(t5, t6, t9) // k1 = k2 ^ k3 + + Comment("e0= (m1 & k0)") + VMOVDQU(buffer.Offset((8+1)*32), t10) // m1 + VPAND(t4, t10, t10) // e0 + Comment("e1= (g5 & l1)") + VMOVDQU(buffer.Offset(5*32), t11) + VPAND(t2, t11, t11) // e1 + Comment("r0=e0 ^ e1") + VPXOR(t11, t10, t10) // r0 = e0 ^ e1 + Comment("e2=(g4 & l0)") + VMOVDQU(buffer.Offset(4*32), t12) + VPAND(t1, t12, t12) // e2 + Comment("r1=e2 ^ e1") + VPXOR(t12, t11, t11) // r1 = e2 ^ e1 + Comment("store r0 r1") + VMOVDQU(t10, buffer.Offset(22*32)) // in fact, we can start from 18*32 + VMOVDQU(t11, buffer.Offset(23*32)) + + Comment("e3= (m7 & k3)") + VMOVDQU(buffer.Offset((8+7)*32), t10) // m7 + VPAND(t6, t10, t10) // e3 + Comment("e4= (m5 & k2)") + VMOVDQU(buffer.Offset((8+5)*32), t11) // m5 + VPAND(t5, t11, t11) + Comment("r2=e3 ^ e4") + VPXOR(t11, t10, t10) // r2 = e3 ^ e4 + Comment("e5= (m3 & k1)") + VMOVDQU(buffer.Offset((8+3)*32), t12) // m3 + VPAND(t9, t12, t12) + Comment("r3=e5 ^ e4") + VPXOR(t12, t11, t11) // r3 = e5 ^ e4 + Comment("store r2 r3") + VMOVDQU(t10, buffer.Offset(24*32)) + VMOVDQU(t11, buffer.Offset(25*32)) + + Comment("e6=(m9 & k4)") + VMOVDQU(buffer.Offset((8+9)*32), t10) // m9 + VPAND(t7, t10, t10) + Comment("e7=(g7 & l3)") + VMOVDQU(buffer.Offset(7*32), t11) + VPAND(t8, t11, t11) + Comment("r4=e7 ^ e6") + VPXOR(t11, t10, t10) // r4 = e6 ^ e7 + Comment("e8=(g6 & l2)") + VMOVDQU(buffer.Offset(6*32), t12) + VPAND(t3, t12, t12) // e8 + Comment("r5=e8 ^ e6") + VPXOR(t12, t11, t12) // r5 = e8 ^ e7 = t12 + Comment("store r4") + VMOVDQU(t10, 
buffer.Offset(26*32)) + + Comment("e9=(m0 & k0)") + VMOVDQU(buffer.Offset((8+0)*32), t10) // m0 + VPAND(t4, t10, t10) + Comment("e10=(g1 & l1)") + VMOVDQU(buffer.Offset(1*32), t4) + VPAND(t4, t2, t2) // e10 + Comment("r6=e9 ^ e10") + VPXOR(t10, t2, t10) // r6 = e9 ^ e10 = t10 + Comment("e11=(g0 & l0)") + VMOVDQU(buffer, t11) + VPAND(t1, t11, t11) + Comment("r7=e11 ^ e10") + VPXOR(t11, t2, t1) // r7 = e11 ^ e10 = t1 + + Comment("e12=(m6 & k3)") + VMOVDQU(buffer.Offset((8+6)*32), t2) // m6 + VPAND(t2, t6, t2) // e12 + Comment("e13=(m4 & k2)") + VMOVDQU(buffer.Offset((8+4)*32), t6) // m4 + VPAND(t6, t5, t5) // e13 + Comment("r8=e12 ^ e13") + VPXOR(t2, t5, t2) // r8 = e12 ^ e13 = t2 + Comment("e14=(m2 & k1)") + VMOVDQU(buffer.Offset((8+2)*32), t6) // m2 + VPAND(t6, t9, t6) + Comment("r9=e14 ^ e13") + VPXOR(t5, t6, t4) // r9 = e14 ^ e13 = t4 + + Comment("e15=(m8 & k4)") + VMOVDQU(buffer.Offset((8+8)*32), t9) // m8 + VPAND(t9, t7, t9) // e15 + Comment("e16=(g3 & l3)") + VMOVDQU(buffer.Offset(3*32), t7) + VPAND(t7, t8, t8) // e16 + Comment("r10=e15 ^ e16") + VPXOR(t9, t8, t5) // r10 = e15 ^ e16 = t5 + Comment("e17=(g2 & l2)") + VMOVDQU(buffer.Offset(2*32), t9) + VPAND(t3, t9, t3) // e17 + Comment("r11=e17 ^ e16") + VPXOR(t3, t8, t3) // r11 = e17 ^ e16 = t3 - Comment("14=14^6^12^20^28") - PXOR(X3, X0) - PXOR(X7, X0) - PXOR(X9, X0) - PXOR(X5, X0) - MOVOU(X0, buffer.Offset(14*16)) + Comment("start output function") + // t12 = r5 + // t10 = r6 + // t1 = r7 + // t2 = r8 + // t4 = r9 + // t5 = r10 + // t3 = r11 + Comment("[t1]=r7 ^ r9") + VPXOR(t1, t4, t4) // [t1] = t4 = r7 ^ r9 - MOVOU(b.Offset(1*16), X0) - MOVOU(b.Offset(9*16), X1) - MOVOU(b.Offset(17*16), X2) - MOVOU(b.Offset(25*16), X3) - MOVOU(b.Offset(19*16), X5) - MOVOU(b.Offset(23*16), X6) - MOVOU(b.Offset(27*16), X7) - MOVOU(b.Offset(31*16), X8) - MOVOU(b.Offset(3*16), X9) + Comment("[t2]=t1 ^ r1") + VMOVDQU(buffer.Offset((22+1)*32), t6) // r1 + VPXOR(t4, t6, t6) // [t2] = t6 = r1 ^ [t1] - Comment("1=1^25^15^23^31") - MOVOU(X0, X4) - PXOR(X3, X4) - PXOR(b.Offset(15*16), X4) - PXOR(X6, X4) - PXOR(X8, X4) - MOVOU(X4, buffer.Offset(1*16)) + Comment("[t3]=t2 ^ r3") + VMOVDQU(buffer.Offset((22+3)*32), t8) // r3 + VPXOR(t6, t8, t7) // [t3] = t7 = r3 ^ [t2] + Comment("[t4]=r5 ^ r3") + VPXOR(t12, t8, t8) // [t4] = t8 = r5 ^ r3 - Comment("3=3^27^1^9^17") - MOVOU(X0, X4) - PXOR(X9, X4) - PXOR(X7, X4) - PXOR(X1, X4) - PXOR(X2, X4) - MOVOU(X4, buffer.Offset(3*16)) + Comment("[t5]=r4 ^ [t4]") + VMOVDQU(buffer.Offset((22+4)*32), t11) // r4 + VPXOR(t8, t11, t9) // [t5] = t9 = r4 ^ t4 + Comment("[t6]=r0 ^ r4") + VPXOR(buffer.Offset(22*32), t11, t11) // [t6] = t11 = r4 ^ r0 - Comment("9=9^1^23^31^7") - MOVOU(X0, X4) - PXOR(X1, X4) - PXOR(X6, X4) - PXOR(X8, X4) - PXOR(b.Offset(7*16), X4) - MOVOU(X4, buffer.Offset(9*16)) + Comment("[t7]=r11 ^ r7") + VPXOR(t3, t1, t1) // [t7] t1 = r7 ^ r11 - Comment("19=1^19^11^17^25") - MOVOU(X0, X4) - PXOR(X5, X4) - PXOR(b.Offset(11*16), X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(19*16)) + Comment("[t8]=[t1] ^ [t4]") + VPXOR(t4, t8, t8) // t8 = t4 ^ t11 + Comment("store t8") + VMOVDQU(t8, b.Offset(5*32)) - Comment("27=1^27^19^25^9") - PXOR(X1, X0) - PXOR(X7, X0) - PXOR(X5, X0) - PXOR(X3, X0) - MOVOU(X0, buffer.Offset(27*16)) + Comment("[t9]=[t1] ^ [t6]") + VPXOR(t11, t4, t4) // [t9] = t4 + Comment("store t9") + VMOVDQU(t4, b.Offset(2*32)) - Comment("11=11^3^9^17^25") - MOVOU(X9, X4) - PXOR(b.Offset(11*16), X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(X3, X4) - MOVOU(X4, buffer.Offset(11*16)) + Comment("[t10]=r2 ^ 
t5") + VPXOR(buffer.Offset((22+2)*32), t9, t9) // [t10] t9 = r2 ^ [t5] + Comment("[t11]=r10 ^ r8") + VPXOR(t5, t2, t2) // [t11] = t2 + Comment("store t11") + VMOVDQU(t2, b.Offset(3*32)) + Comment("[t12]=^([t3] ^ [t11])") + VPXOR(t7, t2, t2) + VPANDN(f, t2, t2) // [t12] = t2 + Comment("store t12") + VMOVDQU(t2, b.Offset(1*32)) + Comment("[t13]=[t10] ^ [t12]") + VPXOR(t2, t9, t9) // [t13] = t9 + Comment("store t13") + VMOVDQU(t9, b.Offset(6*32)) - MOVOU(b.Offset(7*16), X0) - MOVOU(b.Offset(15*16), X5) - Comment("17=17^9^31^7^15") - PXOR(X2, X1) - PXOR(X8, X1) - PXOR(X0, X1) - PXOR(X5, X1) - MOVOU(X1, buffer.Offset(17*16)) + Comment("[t14]=^([t3] ^ [t7])") + VPXOR(t7, t1, t1) + VPANDN(f, t1, t1) // [t14] + Comment("store t14") + VMOVDQU(t1, b.Offset(4*32)) + Comment("[t16]=[t6] ^ [t14]") + VPXOR(t11, t1, t1) // [t16] + Comment("store t16") + VMOVDQU(t1, b) - Comment("25=25^17^7^15^23") - PXOR(X3, X2) - PXOR(X0, X2) - PXOR(X5, X2) - PXOR(X6, X2) - MOVOU(X2, buffer.Offset(25*16)) + Comment("[t15]=^(r10 ^ r6)") + VPXOR(t10, t5, t5) + VPANDN(f, t5, t5) + Comment("store t15") + VMOVDQU(t5, b.Offset(7*32)) - MOVOU(b.Offset(5*16), X1) - MOVOU(b.Offset(11*16), X2) - MOVOU(b.Offset(13*16), X3) - // X0=7, X1=5, X9=X4=3, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 - Comment("5=5^29^3^11^19") - MOVOU(X9, X4) - PXOR(X1, X4) - PXOR(X2, X4) - PXOR(b.Offset(19*16), X4) - PXOR(b.Offset(29*16), X4) - MOVOU(X4, buffer.Offset(5*16)) + VZEROUPPER() + RET() +} - Comment("21=21^13^19^27^3") - MOVOU(X9, X4) - PXOR(b.Offset(21*16), X4) - PXOR(X3, X4) - PXOR(b.Offset(19*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(21*16)) +func xorRoundKey256avx2() { + // xorRoundKey256avx2 function + TEXT("xorRoundKey256avx2", NOSPLIT, "func(rk uint32, x1, x2, x3, out *byte)") + Doc("xor x1, x2, x3 with round key, 32 bytes per bit") - Comment("29=29^21^27^3^11") - PXOR(b.Offset(29*16), X9) - PXOR(b.Offset(21*16), X9) - PXOR(X7, X9) - PXOR(X2, X9) - MOVOU(X9, buffer.Offset(29*16)) + x := Load(Param("rk"), GP32()) + x1 := Mem{Base: Load(Param("x1"), GP64())} + x2 := Mem{Base: Load(Param("x2"), GP64())} + x3 := Mem{Base: Load(Param("x3"), GP64())} + out := Mem{Base: Load(Param("out"), GP64())} - MOVOU(b.Offset(21*16), X9) - // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=27, X8=31 + ret := YMM() + one := YMM() + VPCMPEQB(one, one, one) - Comment("7=7^31^5^13^21") - MOVOU(X1, X4) - PXOR(X0, X4) - PXOR(X3, X4) - PXOR(X8, X4) - PXOR(X9, X4) - MOVOU(X4, buffer.Offset(7*16)) + y := GP32() - Comment("13=13^5^11^19^27") - MOVOU(X1, X4) - PXOR(X3, X4) - PXOR(X2, X4) - PXOR(b.Offset(19*16), X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(13*16)) + count := GP64() + XORQ(count, count) + Comment("Handle first byte") + MOVL(U32(0x01000000), y) + Label("rk_loop_1") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_1_c")) + VPXOR(one, ret, ret) + Label("rk_loop_1_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(256)) + JL(LabelRef("rk_loop_1")) - MOVOU(b.Offset(29*16), X7) - // X0=7, X1=5, X9=21, X2=11, X3=13, X5=15, X6=23, X7=29, X8=31 - Comment("23=23^15^21^29^5") - MOVOU(X1, X4) - PXOR(X5, X4) - PXOR(X6, X4) - PXOR(X9, X4) - PXOR(X7, X4) - MOVOU(X4, buffer.Offset(23*16)) + Comment("Handle second byte") + MOVL(U32(0x00010000), y) + Label("rk_loop_2") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_2_c")) + VPXOR(one, 
ret, ret) + Label("rk_loop_2_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(512)) + JL(LabelRef("rk_loop_2")) - Comment("31=31^23^29^5^13") - PXOR(X8, X1) - PXOR(X6, X1) - PXOR(X3, X1) - PXOR(X7, X1) - MOVOU(X1, buffer.Offset(31*16)) + Comment("Handle third byte") + MOVL(U32(0x00000100), y) + Label("rk_loop_3") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_3_c")) + VPXOR(one, ret, ret) + Label("rk_loop_3_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(768)) + JL(LabelRef("rk_loop_3")) - Comment("15=15^7^13^21^29") - PXOR(X3, X0) - PXOR(X7, X0) - PXOR(X9, X0) - PXOR(X5, X0) - MOVOU(X0, buffer.Offset(15*16)) + Comment("Handle last byte") + MOVL(U32(0x00000001), y) + Label("rk_loop_4") + VMOVDQU(x1.Idx(count, 1), ret) + VPXOR(x2.Idx(count, 1), ret, ret) + VPXOR(x3.Idx(count, 1), ret, ret) + TESTL(x, y) + JZ(LabelRef("rk_loop_4_c")) + VPXOR(one, ret, ret) + Label("rk_loop_4_c") + VMOVDQU(ret, out.Idx(count, 1)) + ROLL(U8(1), y) + ADDQ(U8(32), count) + CMPQ(count, U32(1024)) + JL(LabelRef("rk_loop_4")) RET() } @@ -2019,11 +3562,17 @@ func main() { transpose128Rev() transpose128avx(flipMask) transpose128RevAvx(flipMask) + transpose256avx(flipMask) + transpose128x256avx2(flipMask) + transpose256RevAvx(flipMask) xor32x128() xor32x128avx() xorRoundKey128() sbox128() l128() + l256() + sbox256avx2() + xorRoundKey256avx2() Generate() } diff --git a/bs128.go b/bs128.go index f2cc2a6..127da66 100644 --- a/bs128.go +++ b/bs128.go @@ -20,7 +20,7 @@ func (bs bs128) tao(x, buffer []byte) []byte { } func (bs bs128) xor32(x1, x2 []byte) []byte { - xor32x128avx(&x1[0], &x2[0], &x1[0]) + xor32x128avx(32*bs.bytes(), &x1[0], &x2[0], &x1[0]) return x1 } diff --git a/bs128_test.go b/bs128_test.go index d978198..e6a1604 100644 --- a/bs128_test.go +++ b/bs128_test.go @@ -94,20 +94,16 @@ func BenchmarkTao128(b *testing.B) { } func TestL128(t *testing.T) { - x := make([]byte, 32*BS128.bytes()) buffer := make([]byte, 32*BS128.bytes()) - b := 0x00010203 ^ bits.RotateLeft32(0x00010203, 2) ^ bits.RotateLeft32(0x00010203, 10) ^ bits.RotateLeft32(0x00010203, 18) ^ bits.RotateLeft32(0x00010203, 24) + b := uint32(0xe0e7eef5) ^ bits.RotateLeft32(0xe0e7eef5, 2) ^ bits.RotateLeft32(0xe0e7eef5, 10) ^ bits.RotateLeft32(0xe0e7eef5, 18) ^ bits.RotateLeft32(0xe0e7eef5, 24) expected := newUint32x128(b) - copy(x, newByte128(byte(0))) - copy(x[8*BS128.bytes():], newByte128(byte(1))) - copy(x[16*BS128.bytes():], newByte128(byte(2))) - copy(x[24*BS128.bytes():], newByte128(byte(3))) + x := newUint32x128(0xe0e7eef5) ret := BS128.l(x, buffer) if !bytes.Equal(ret, expected) { - t.Fatalf("unexpected l128 result, expected %x, got %x", expected, ret) + t.Fatalf("unexpected l256 result, expected %x, got %x", expected, ret) } } diff --git a/bs_amd64.go b/bs_amd64.go index 5d8a25e..ed77520 100644 --- a/bs_amd64.go +++ b/bs_amd64.go @@ -22,11 +22,20 @@ func transpose128avx(in *byte, out *byte) // Bit level matrix transpose, b0-b1-b2-b3, 128x128 func transpose128RevAvx(in *byte, out *byte) +// Bit level matrix transpose, 256x128 => 128x256 +func transpose256avx(in *byte, out *byte) + +// Bit level matrix transpose, 128x256 => 256x128, just for test here. 
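xorRoundKey256avx2 above walks the 32 round-key bits one byte at a time: the mask starts at 0x01000000, 0x00010000, 0x00000100 and 0x00000001 for the four groups of eight slices and is rotated left after each slice, and for every bit the routine XORs the three input slices and complements the result (XOR with the all-ones register) when that key bit is set. A scalar model of the same loop structure over 32-byte slices, offered as a sketch rather than the generated code:

// xorRoundKeyRef mirrors xorRoundKey256avx2: slice i of out is x1^x2^x3,
// complemented when the round-key bit selected by the rotating mask is set.
func xorRoundKeyRef(rk uint32, x1, x2, x3, out []byte) {
	masks := [4]uint32{0x01000000, 0x00010000, 0x00000100, 0x00000001}
	const w = 32 // bytes per bit slice
	for g, m := range masks {
		for j := 0; j < 8; j++ {
			off := (g*8 + j) * w
			for k := 0; k < w; k++ {
				v := x1[off+k] ^ x2[off+k] ^ x3[off+k]
				if rk&m != 0 {
					v = ^v // key bit set: flip the whole slice
				}
				out[off+k] = v
			}
			m = m<<1 | m>>31 // ROLL $1
		}
	}
}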
+func transpose128x256avx2(in *byte, out *byte) + +// Bit level matrix transpose, b0-b1-b2-b3, 128x256 +func transpose256RevAvx(in *byte, out *byte) + // out = x xor y func xor32x128(x *byte, y *byte, out *byte) // out = x xor y -func xor32x128avx(x *byte, y *byte, out *byte) +func xor32x128avx(len int, x *byte, y *byte, out *byte) // xor x1, x2, x3 with round key, 16 bytes per bit func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) @@ -36,3 +45,12 @@ func sbox128(x *byte, buffer *byte) // l128, 128 bits per 'byte' func l128(x *byte, buffer *byte) + +// l256, 256 bits per 'byte' +func l256(x *byte, buffer *byte) + +// sbox256avx2, 256 bits per 'byte' +func sbox256avx2(x *byte, buffer *byte) + +// xor x1, x2, x3 with round key, 32 bytes per bit +func xorRoundKey256avx2(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) diff --git a/bs_amd64.s b/bs_amd64.s index 1850558..556e139 100644 --- a/bs_amd64.s +++ b/bs_amd64.s @@ -3106,946 +3106,4023 @@ col_loop_b0: VZEROUPPER RET +// func transpose256avx(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose256avx(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row + XORQ SI, SI + +row_loop: + // Initialize cc, current col + XORQ BX, BX + +col_loop: + // Initialize (rr * ncols + cc) / 8, here ncols=128 + MOVQ SI, DI + + // Multiple with ncols + SHLQ $0x07, DI + ADDQ BX, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD 
$0x01, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x10, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x10, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + 
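Each eight-step block in the transpose output path serializes one YMM register into eight 32-bit rows: VPMOVMSKB collects the top bit of each of the 32 bytes, the mask is stored, and VPSLLQ $1 promotes the next bit to the top for the following pass. Shifting 64-bit lanes instead of individual bytes is safe here because a carry from a lower byte cannot reach a byte's top bit within seven shifts. A scalar model of one such block, for illustration only:

// movmskRows models one VPMOVMSKB/VPSLLQ block: pass p returns bit (7-p) of
// every byte, packed with byte i at result bit i, exactly as VPMOVMSKB does.
func movmskRows(v [32]byte) (rows [8]uint32) {
	for p := 0; p < 8; p++ {
		var m uint32
		for i, b := range v {
			m |= uint32(b>>7) << uint(i)
		}
		rows[p] = m
		for i := range v {
			v[i] <<= 1 // stands in for VPSLLQ $1; top bits agree for these 8 passes
		}
	}
	return rows
}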
MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 256 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x08, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x20, DI + ADDQ $0x08, BX + + // Compare cc with ncols, here ncols=128 + CMPQ BX, $0x80 + JL col_loop + ADDQ $0x20, SI + + // Compare rr with nrows, here nrows=256 + CMPQ SI, $0x00000100 + JL row_loop + VZEROUPPER + RET + +// func transpose128x256avx2(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose128x256avx2(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row + XORQ SI, SI + +row_loop: + // Initialize cc, current col + XORQ BX, BX + +col_loop: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ SI, DI + + // Multiple with ncols + SHLQ $0x08, DI + ADDQ BX, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL 
(AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + 
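transpose256avx (256x128 => 128x256) and transpose128x256avx2 (128x256 => 256x128) are inverse reshapes of the same bit matrix, so a plain byte-wise oracle is enough to check either direction. The sketch below assumes MSB-first bit numbering within each byte, which is what the flip mask and the VPMOVMSKB handling suggest; treat the exact bit convention as an assumption rather than something the patch states.

// transposeRef computes a bit-level transpose of a row-major bit matrix:
// bit (r, c) of in becomes bit (c, r) of out. nrows and ncols are bit counts,
// both multiples of 8; bits are taken MSB-first within each byte (assumed).
func transposeRef(in, out []byte, nrows, ncols int) {
	for i := range out {
		out[i] = 0
	}
	for r := 0; r < nrows; r++ {
		for c := 0; c < ncols; c++ {
			if in[(r*ncols+c)/8]>>(7-uint(c)%8)&1 != 0 {
				out[(c*nrows+r)/8] |= 1 << (7 - uint(r)%8)
			}
		}
	}
}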
VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ BX, DI + ADDQ $0x07, DI + + // Multiple with nrows + SHLQ $0x07, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, BX + + // Compare cc with ncols, here ncols=256 + CMPQ BX, $0x00000100 + JL col_loop + ADDQ $0x20, SI + + // Compare rr with nrows, here nrows=128 + CMPQ SI, $0x00000080 + JL row_loop + VZEROUPPER + RET + +// func transpose256RevAvx(in *byte, out *byte) +// Requires: AVX, AVX2, SSE2, SSE4.1, SSSE3 +TEXT ·transpose256RevAvx(SB), NOSPLIT, $0-16 + MOVQ in+0(FP), AX + MOVQ out+8(FP), CX + + // Initialize rr, current row, 96 + XORQ BX, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b3: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00006000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + 
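Each 4-byte store above is one VPMOVMSKB (collect the most significant bit of all 32 bytes in the YMM) followed by VPSLLQ $1 (shift every 64-bit lane left so the next bit becomes the MSB). A behavioural model in Go — msbColumns is an illustrative name; a per-byte shift stands in for the per-qword VPSLLQ, which gives the same extracted bits because only eight bits are ever read from each byte before the vector is reloaded:

// msbColumns: rows holds one byte from each of 32 input rows; the i-th
// result word holds bit i (counting from the MSB) of every row, i.e. one
// 32-bit column of the transposed output.
func msbColumns(rows [32]byte) (cols [8]uint32) {
	for i := 0; i < 8; i++ {
		var m uint32
		for j := 0; j < 32; j++ {
			m |= uint32(rows[j]>>7) << uint(j) // VPMOVMSKB: top bit of each byte
			rows[j] <<= 1                      // VPSLLQ $1: expose the next bit
		}
		cols[i] = m
	}
	return cols
}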
PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the 
most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b3 + ADDQ $0x20, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b2: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00004000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ 
$0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL 
DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x04, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b2 + ADDQ $0x20, BX + + // Initialize cc, current 
col + XORQ SI, SI + +col_loop_b1: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ $0x00002000, DI + ADDQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of the 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB 
Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x08, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, 
(CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b1 + ADDQ $0x20, BX + + // Initialize cc, current col + XORQ SI, SI + +col_loop_b0: + // Initialize (rr * ncols + cc) / 8, here ncols=256 + MOVQ SI, DI + SHRQ $0x03, DI + + // Construct eight XMM with first 4 bytes of first 32 rows + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X2 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X2 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X2 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X3 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X3 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X3 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X4 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X4 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X4 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X5 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X5 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X5 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X6 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X6 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X6 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X7 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X7 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X7 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X8 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X8 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X8 + MOVL (AX)(DI*1), DX + PINSRD $0x00, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x01, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x02, DX, X9 + ADDQ $0x20, DI + MOVL (AX)(DI*1), DX + PINSRD $0x03, DX, X9 + ADDQ $0x20, DI + PSHUFB flip_mask<>+0(SB), X9 + + // Matrix transform 4x4 + VPUNPCKHDQ X3, X2, X1 + VPUNPCKLDQ X3, X2, X2 + VPUNPCKLDQ X5, X4, X0 + VPUNPCKHDQ X5, X4, X4 + VPUNPCKHQDQ X0, X2, X3 + VPUNPCKLQDQ X0, X2, X2 + VPUNPCKHQDQ X4, X1, X5 + VPUNPCKLQDQ X4, X1, X4 + VPUNPCKHDQ X7, X6, X1 + VPUNPCKLDQ X7, X6, X6 + VPUNPCKLDQ X9, X8, X0 + VPUNPCKHDQ X9, X8, X8 + VPUNPCKHQDQ X0, X6, X7 + VPUNPCKLQDQ X0, X6, X6 + VPUNPCKHQDQ X8, X1, X9 + VPUNPCKLQDQ X8, X1, X8 + MOVOU X2, X0 + VINSERTI128 $0x01, X6, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX 
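The four column blocks of transpose256RevAvx (col_loop_b3 down to col_loop_b0) read input rows 96..127, 64..95, 32..63 and 0..31 respectively, and their stores add 0, 4, 8 and 12 to the output offset, so the block taken from the highest input rows lands in the lowest-addressed 32-bit word of each 16-byte output row. A small helper sketching that placement (illustrative only, not part of the package):

// revWordOffset returns the byte offset of block bN's word within the
// output for a given output row (an input column): 16 bytes per output
// row (nrows = 128), 4 bytes per block word, words in reversed order.
func revWordOffset(block, outCol int) int {
	return outCol*16 + (3-block)*4
}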
+ MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X3, X0 + VINSERTI128 $0x01, X7, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X4, X0 + VINSERTI128 $0x01, X8, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + MOVOU X5, X0 + VINSERTI128 $0x01, X9, Y0, Y0 + + // Initialize ((cc + 7) * nrows + rr) / 8, here nrows = 128 + MOVQ SI, DI + ADDQ $0x07, DI + SHLQ $0x04, DI + ADDQ $0x0c, DI + + // Get the most significant bit of each 8-bit element in the YMM, and store the returned 4 bytes + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, 
(CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + VPMOVMSKB Y0, DX + MOVL DX, (CX)(DI*1) + VPSLLQ $0x01, Y0, Y0 + + // Sub nrows / 8 + SUBQ $0x10, DI + ADDQ $0x08, SI + + // Compare cc with ncols, here ncols=256 + CMPQ SI, $0x00000100 + JL col_loop_b0 + VZEROUPPER + RET + // func xor32x128(x *byte, y *byte, out *byte) // Requires: SSE2 -TEXT ·xor32x128(SB), NOSPLIT, $0-24 - MOVQ x+0(FP), AX - MOVQ y+8(FP), CX - MOVQ out+16(FP), DX - XORQ BX, BX +TEXT ·xor32x128(SB), NOSPLIT, $0-24 + MOVQ x+0(FP), AX + MOVQ y+8(FP), CX + MOVQ out+16(FP), DX + XORQ BX, BX + +xor32_loop: + MOVOU (AX)(BX*1), X0 + MOVOU (CX)(BX*1), X1 + PXOR X0, X1 + MOVOU X1, (DX)(BX*1) + ADDQ $0x10, BX + CMPQ BX, $0x00000200 + JL xor32_loop + RET + +// func xor32x128avx(len int, x *byte, y *byte, out *byte) +// Requires: AVX, AVX2 +TEXT ·xor32x128avx(SB), NOSPLIT, $0-32 + MOVQ x+8(FP), AX + MOVQ y+16(FP), CX + MOVQ out+24(FP), DX + MOVQ len+0(FP), BX + XORQ SI, SI + +xor32_loop_avx: + VMOVDQU (AX)(SI*1), Y0 + VMOVDQU (CX)(SI*1), Y1 + VPXOR Y0, Y1, Y1 + VMOVDQU Y1, (DX)(SI*1) + ADDQ $0x20, SI + CMPQ SI, BX + JL xor32_loop_avx + VZEROUPPER + RET + +// func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) +// Requires: SSE2 +TEXT ·xorRoundKey128(SB), NOSPLIT, $0-40 + MOVL rk+0(FP), AX + MOVQ x1+8(FP), CX + MOVQ x2+16(FP), DX + MOVQ x3+24(FP), BX + MOVQ out+32(FP), SI + PCMPEQB X1, X1 + XORQ R8, R8 + + // Handle first byte + MOVL $0x01000000, DI + +rk_loop_1: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_1_c + PXOR X1, X0 + +rk_loop_1_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000080 + JL rk_loop_1 + + // Handle second byte + MOVL $0x00010000, DI + +rk_loop_2: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_2_c + PXOR X1, X0 + +rk_loop_2_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000100 + JL rk_loop_2 + + // Handle third byte + MOVL $0x00000100, DI + +rk_loop_3: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_3_c + PXOR X1, X0 + +rk_loop_3_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000180 + JL rk_loop_3 + + // Handle last byte + MOVL $0x00000001, DI + +rk_loop_4: + MOVOU (CX)(R8*1), X0 + PXOR (DX)(R8*1), X0 + PXOR (BX)(R8*1), X0 + TESTL AX, DI + JZ rk_loop_4_c + PXOR X1, X0 + +rk_loop_4_c: + MOVOU X0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x10, R8 + CMPQ R8, $0x00000200 + JL rk_loop_4 + RET + +// func sbox128(x *byte, buffer *byte) +// Requires: SSE2 +TEXT ·sbox128(SB), NOSPLIT, $0-16 + MOVQ x+0(FP), AX + MOVQ buffer+8(FP), CX + + // f, for not operation + PCMPEQB X0, X0 + + // Start input function + // t1=b7 ^ b5 + MOVOU 112(AX), X1 + PXOR 80(AX), X1 + MOVOU 16(AX), X2 + MOVOU X2, X3 + MOVOU X2, X4 + + // store m6=b1 + MOVOU X2, 224(CX) + + // t2=b5 ^ b1 + PXOR 80(AX), X2 + PANDN X0, X2 + + // store g5=^b0 + MOVOU (AX), X5 + MOVOU X5, X6 + PANDN X0, X6 + MOVOU X6, 80(CX) + + // t3=^(b0 ^ t2) + PXOR X2, X5 + PANDN X0, X5 + + // t4=b6 ^ b2 + MOVOU 96(AX), X6 + MOVOU X6, X7 + PXOR 32(AX), X6 + + // t5=b3 ^ t3 + MOVOU 48(AX), X8 + MOVOU X8, X9 + PXOR X5, 
X8 + + // t6=b4 ^ t1 + MOVOU 64(AX), X10 + PXOR X1, X10 + + // t7=b1 ^ t5 + PXOR X8, X3 + + // t8=b1 ^ t4 + PXOR X6, X4 + + // t9=t6 ^ t8 + MOVOU X10, X11 + PXOR X4, X11 + + // store m8 + MOVOU X11, 256(CX) + + // store g1 + MOVOU X3, 16(CX) + + // store g3 + MOVOU X8, 48(CX) + + // store g4 + MOVOU X2, 64(CX) + + // store m0 + MOVOU X10, 128(CX) + + // store m1 + MOVOU X5, 144(CX) + + // store m2 + MOVOU X4, 160(CX) + + // store m4 + MOVOU X6, 192(CX) + + // t11=^(b3 ^ t1) + PXOR X1, X9 + PANDN X0, X9 + + // store m5, can reuse t1 now + MOVOU X9, 208(CX) + + // t12=^(b6 ^ t9) + PXOR X11, X7 + PANDN X0, X7 + + // store m9, can reuse t7 t8 t9 now + MOVOU X7, 272(CX) + + // t10=t6 ^ t7 + PXOR X10, X3 + + // store g0, can reuse t6 now + MOVOU X3, (CX) + + // t13=t4 ^ t10 + PXOR X6, X3 + + // store g2, can reuse t4 now + MOVOU X3, 32(CX) + + // t14=t2 ^ t11 + MOVOU X9, X1 + PXOR X2, X1 + + // store g6, can reuse t2 now + MOVOU X1, 96(CX) + + // t15=t12^t14 + PXOR X7, X1 + + // store g7 + MOVOU X1, 112(CX) + + // t16=t3 ^ t12 + PXOR X5, X7 + + // store m3 + MOVOU X7, 176(CX) + + // t17=t11 ^ t16 + PXOR X9, X7 + + // store m7 + MOVOU X7, 240(CX) + + // Start top function + // Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8 + // t2=m0 & m1 + PAND X10, X5 + + // t3=g0 & g4 + PAND (CX), X2 + + // t4=g3 & g7 + MOVOU X1, X10 + PAND X8, X1 + + // t7=g3 | g7 + POR X10, X8 + + // t11=m4 & m5 + PAND X6, X9 + MOVOU 176(CX), X6 + MOVOU X6, X10 + + // t10=m3 & m2 + PAND X4, X10 + + // t12=m3 | m2 + POR X4, X6 + + // t6=g6 | g2 + POR 96(CX), X3 + + // t9=m6 | m7 + POR 224(CX), X7 + MOVOU 272(CX), X4 + MOVOU X4, X12 + + // t5=m8 & m9 + PAND X11, X4 + + // t8=m8 | m9 + POR X11, X12 + + // t14 = t3 ^ t2 + PXOR X5, X2 + + // t16 = t5 ^ t14 + PXOR X2, X4 + + // t20 = t16 ^ t7 + PXOR X4, X8 + + // t17 = t9 ^ t10 + PXOR X7, X10 + + // t18 = t11 ^ t12 + PXOR X9, X6 + + // p2 = t20 ^ t18 + PXOR X8, X6 + + // p0 = t6 ^ t16 + PXOR X3, X4 + + // t1 = g5 & g1 + MOVOU 16(CX), X2 + MOVOU 80(CX), X8 + PAND X2, X8 + + // t13 = t1 ^ t2 + PXOR X8, X5 + + // t15 = t13 ^ t4 + PXOR X1, X5 + + // t19 = t6 ^ t15 + PXOR X5, X3 + + // p3 = t19 ^ t17 + PXOR X10, X3 + + // p1 = t8 ^ t15 + PXOR X12, X5 + + // start middle function + // current register status: t8=p0, t3=p1, t4=p2, t7=p0 + // t0 = p1 & p2 + MOVOU X5, X1 + PAND X6, X1 + + // t1 = p3 & p0 + MOVOU X4, X2 + PAND X3, X2 + + // t2 = p0 & p2 + MOVOU X6, X8 + PAND X4, X8 + + // t3 = p1 & p3 + MOVOU X5, X10 + PAND X3, X10 + + // t4 = t0 & t2 + MOVOU X1, X11 + PAND X8, X11 + + // t5 = t1 & t3 + MOVOU X2, X12 + PXOR X10, X12 + + // t6 = t5 | p0 + POR X12, X4 + + // t7 = t2 | p3 + POR X8, X3 + + // t8 = t4 ^ t6 + PXOR X11, X4 + + // t9 = t7 ^ t3 + PXOR X3, X10 + + // t10 = t0 ^ t9 + PXOR X1, X10 + + // t11 = p2 | t5 + POR X12, X6 + + // l1 = t11 ^ t1 + PXOR X6, X2 + + // t12 = p1 | t2 + POR X8, X5 + + // l2 = t12 ^ t5 + PXOR X12, X5 + + // start bottom function + // current register status: t6=l0, t2=l1, t3=l2, t8=l3 + // k4 = l2 ^ l3 + MOVOU X4, X8 + PXOR X5, X8 + + // k3 = l1 ^ l3 + MOVOU X4, X6 + PXOR X2, X6 + + // k2 = l0 ^ l2 + MOVOU X10, X3 + PXOR X5, X3 + + // k0 = l0 ^ l1 + MOVOU X10, X1 + PXOR X2, X1 + + // k1 = k2 ^ k3 + MOVOU X6, X11 + PXOR X3, X11 + + // e0=(m1 & k0) + MOVOU 144(CX), X12 + PAND X1, X12 + + // e1=(g5 & l1) + MOVOU 80(CX), X9 + PAND X2, X9 + + // r0=e0 ^ e1 + PXOR X9, X12 + + // e2=(g4 & l0) + MOVOU 64(CX), X7 + PAND X10, X7 + + // r1=e2 ^ e1 + PXOR X7, X9 + + // store r0 r1 + 
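sbox128 evaluates the S-box as a Boolean circuit over eight bit planes: plane b_i (16 bytes at offset 16*i) holds one bit position of 128 independent inputs, so each PXOR/PAND/POR is a single gate applied to all 128 inputs at once, and PANDN against the all-ones X0 register is the bitwise NOT. A minimal sketch of the first gates of the input function over uint64 planes (64-way instead of 128-way; sboxInputHead is an illustrative name, not part of the package):

// b[i] is bit plane i of 64 parallel S-box inputs.
func sboxInputHead(b *[8]uint64) (t1, t2, t3 uint64) {
	t1 = b[7] ^ b[5]    // t1 = b7 ^ b5
	t2 = ^(b[5] ^ b[1]) // t2: PXOR, then PANDN with the all-ones register
	t3 = ^(b[0] ^ t2)   // t3 = ^(b0 ^ t2)
	return t1, t2, t3
}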
MOVOU X12, 352(CX) + MOVOU X9, 368(CX) + + // e3=(m7 & k3) + MOVOU 240(CX), X12 + PAND X6, X12 + + // e4=(m5 & k2) + MOVOU 208(CX), X9 + PAND X3, X9 + + // r2=e3 ^ e4 + PXOR X9, X12 + + // e5=(m3 & k1) + MOVOU 176(CX), X7 + PAND X11, X7 + + // r3=e5 ^ e4 + PXOR X7, X9 + + // store r2 r3 + MOVOU X12, 384(CX) + MOVOU X9, 400(CX) + + // e6=(m9 & k4) + MOVOU 272(CX), X12 + PAND X8, X12 + + // e7=(g7 & l3) + MOVOU 112(CX), X9 + PAND X4, X9 + + // r4=e7 ^ e6 + PXOR X9, X12 + + // e8=(g6 & l2) + MOVOU 96(CX), X7 + PAND X5, X7 + + // r5=e8 ^ e6 + PXOR X9, X7 + + // store r4 + MOVOU X12, 416(CX) + + // e9=(m0 & k0) + MOVOU 128(CX), X12 + PAND X1, X12 + + // e10=(g1 & l1) + MOVOU 16(CX), X1 + PAND X2, X1 + + // r6=e9 ^ e10 + PXOR X1, X12 + + // e11=(g0 & l0) + MOVOU (CX), X9 + PAND X9, X10 + + // r7=e11 ^ e10 + PXOR X10, X1 + + // e12=(m6 & k3) + MOVOU 224(CX), X2 + PAND X6, X2 + + // e13=(m4 & k2) + MOVOU 192(CX), X10 + PAND X3, X10 + + // r8=e12 ^ e13 + PXOR X10, X2 + + // e14=(m2 & k1) + MOVOU 160(CX), X6 + PAND X11, X6 + + // r9=e14 ^ e13 + PXOR X10, X6 + + // e15=(m8 & k4) + MOVOU 256(CX), X11 + PAND X11, X8 + + // e16=(g3 & l3) + MOVOU 48(CX), X11 + PAND X11, X4 + + // r10=e15 ^ e16 + PXOR X4, X8 + + // e17=(g2 & l2) + MOVOU 32(CX), X9 + PAND X9, X5 + + // r11=e17 ^ e16 + PXOR X4, X5 + + // start output function + // [t1]=r7 ^ r9 + PXOR X1, X6 + + // [t2]=t1 ^ r1 + MOVOU 368(CX), X10 + PXOR X6, X10 + + // [t3]=t2 ^ r3 + MOVOU 400(CX), X3 + MOVOU X3, X4 + PXOR X10, X3 + + // [t4]=r5 ^ r3 + PXOR X7, X4 + + // [t5]=r4 ^ [t4] + MOVOU 416(CX), X11 + MOVOU X11, X9 + PXOR X4, X11 + + // [t6]=r0 ^ r4 + PXOR 352(CX), X9 + + // [t7]=r11 ^ r7 + PXOR X5, X1 + + // [t8]=[t1] ^ [t4] + PXOR X6, X4 + + // store t8 + MOVOU X4, 80(AX) + + // [t9]=[t1] ^ [t6] + PXOR X9, X6 + + // store t9 + MOVOU X6, 32(AX) + + // [t10]=r2 ^ t5 + PXOR 384(CX), X11 + + // [t11]=r10 ^ r8 + PXOR X8, X2 + + // store t11 + MOVOU X2, 48(AX) + + // [t12]=^([t3] ^ [t11]) + PXOR X3, X2 + PANDN X0, X2 + + // store t12 + MOVOU X2, 16(AX) + + // [t13]=[t10] ^ [t12] + PXOR X2, X11 + + // store t13 + MOVOU X11, 96(AX) + + // [t14]=^([t3] ^ [t7]) + PXOR X3, X1 + PANDN X0, X1 + + // store t14 + MOVOU X1, 64(AX) + + // [t16]=[t6] ^ [t14] + PXOR X9, X1 + + // store t16 + MOVOU X1, (AX) + + // [t15]=^(r10 ^ r6) + PXOR X12, X8 + PANDN X0, X8 + + // store t15 + MOVOU X8, 112(AX) + RET + +// func l128(x *byte, buffer *byte) +// Requires: SSE2 +TEXT ·l128(SB), NOSPLIT, $0-16 + MOVQ x+0(FP), AX + MOVQ buffer+8(FP), CX + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 288(AX), X5 + MOVOU 352(AX), X6 + MOVOU 416(AX), X7 + MOVOU 480(AX), X8 + MOVOU 32(AX), X9 + + // 0=0^24^14^22^30 + MOVOU X0, X4 + PXOR X3, X4 + PXOR 224(AX), X4 + PXOR X6, X4 + PXOR X8, X4 + MOVOU X4, (CX) + + // 2=0^2^26^8^16 + MOVOU X0, X4 + PXOR X9, X4 + PXOR X7, X4 + PXOR X1, X4 + PXOR X2, X4 + MOVOU X4, 32(CX) + + // 8=0^8^22^30^6 + MOVOU X0, X4 + PXOR X1, X4 + PXOR X6, X4 + PXOR X8, X4 + PXOR 96(AX), X4 + MOVOU X4, 128(CX) + + // 18=0^18^10^16^24 + MOVOU X0, X4 + PXOR X5, X4 + PXOR 160(AX), X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 288(CX) + + // 26=0^26^18^24^8 + PXOR X1, X0 + PXOR X7, X0 + PXOR X5, X0 + PXOR X3, X0 + MOVOU X0, 416(CX) + + // 10=10^2^8^16^24 + MOVOU X9, X4 + PXOR 160(AX), X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 160(CX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X5 + + // 16=16^8^30^6^14 + PXOR X2, X1 + PXOR X8, X1 + PXOR X0, X1 + PXOR X5, X1 + MOVOU X1, 256(CX) + + // 24=24^16^6^14^22 + PXOR X3, X2 + PXOR X0, X2 + PXOR 
X5, X2 + PXOR X6, X2 + MOVOU X2, 384(CX) + MOVOU 64(AX), X1 + MOVOU 160(AX), X2 + MOVOU 192(AX), X3 + + // 4=4^28^2^10^18 + MOVOU X9, X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR 288(AX), X4 + PXOR 448(AX), X4 + MOVOU X4, 64(CX) + + // 20=20^12^18^26^2 + MOVOU X9, X4 + PXOR 320(AX), X4 + PXOR X3, X4 + PXOR 288(AX), X4 + PXOR X7, X4 + MOVOU X4, 320(CX) -xor32_loop: - MOVOU (AX)(BX*1), X0 - MOVOU (CX)(BX*1), X1 + // 28=28^20^26^2^10 + PXOR 448(AX), X9 + PXOR 320(AX), X9 + PXOR X7, X9 + PXOR X2, X9 + MOVOU X9, 448(CX) + MOVOU 320(AX), X9 + + // 6=6^30^4^12^20 + MOVOU X1, X4 + PXOR X0, X4 + PXOR X3, X4 + PXOR X8, X4 + PXOR X9, X4 + MOVOU X4, 96(CX) + + // 12=12^4^10^18^26 + MOVOU X1, X4 + PXOR X3, X4 + PXOR X2, X4 + PXOR 288(AX), X4 + PXOR X7, X4 + MOVOU X4, 192(CX) + MOVOU 448(AX), X7 + + // 22=22^14^20^28^4 + MOVOU X1, X4 + PXOR X5, X4 + PXOR X6, X4 + PXOR X9, X4 + PXOR X7, X4 + MOVOU X4, 352(CX) + + // 30=30^22^28^4^12 + PXOR X8, X1 + PXOR X6, X1 + PXOR X3, X1 + PXOR X7, X1 + MOVOU X1, 480(CX) + + // 14=14^6^12^20^28 + PXOR X3, X0 + PXOR X7, X0 + PXOR X9, X0 + PXOR X5, X0 + MOVOU X0, 224(CX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 304(AX), X5 + MOVOU 368(AX), X6 + MOVOU 432(AX), X7 + MOVOU 496(AX), X8 + MOVOU 48(AX), X9 + + // 1=1^25^15^23^31 + MOVOU X0, X4 + PXOR X3, X4 + PXOR 240(AX), X4 + PXOR X6, X4 + PXOR X8, X4 + MOVOU X4, 16(CX) + + // 3=3^27^1^9^17 + MOVOU X0, X4 + PXOR X9, X4 + PXOR X7, X4 + PXOR X1, X4 + PXOR X2, X4 + MOVOU X4, 48(CX) + + // 9=9^1^23^31^7 + MOVOU X0, X4 + PXOR X1, X4 + PXOR X6, X4 + PXOR X8, X4 + PXOR 112(AX), X4 + MOVOU X4, 144(CX) + + // 19=1^19^11^17^25 + MOVOU X0, X4 + PXOR X5, X4 + PXOR 176(AX), X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 304(CX) + + // 27=1^27^19^25^9 + PXOR X1, X0 + PXOR X7, X0 + PXOR X5, X0 + PXOR X3, X0 + MOVOU X0, 432(CX) + + // 11=11^3^9^17^25 + MOVOU X9, X4 + PXOR 176(AX), X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR X3, X4 + MOVOU X4, 176(CX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X5 + + // 17=17^9^31^7^15 + PXOR X2, X1 + PXOR X8, X1 PXOR X0, X1 - MOVOU X1, (DX)(BX*1) - ADDQ $0x10, BX - CMPQ BX, $0x00000200 - JL xor32_loop + PXOR X5, X1 + MOVOU X1, 272(CX) + + // 25=25^17^7^15^23 + PXOR X3, X2 + PXOR X0, X2 + PXOR X5, X2 + PXOR X6, X2 + MOVOU X2, 400(CX) + MOVOU 80(AX), X1 + MOVOU 176(AX), X2 + MOVOU 208(AX), X3 + + // 5=5^29^3^11^19 + MOVOU X9, X4 + PXOR X1, X4 + PXOR X2, X4 + PXOR 304(AX), X4 + PXOR 464(AX), X4 + MOVOU X4, 80(CX) + + // 21=21^13^19^27^3 + MOVOU X9, X4 + PXOR 336(AX), X4 + PXOR X3, X4 + PXOR 304(AX), X4 + PXOR X7, X4 + MOVOU X4, 336(CX) + + // 29=29^21^27^3^11 + PXOR 464(AX), X9 + PXOR 336(AX), X9 + PXOR X7, X9 + PXOR X2, X9 + MOVOU X9, 464(CX) + MOVOU 336(AX), X9 + + // 7=7^31^5^13^21 + MOVOU X1, X4 + PXOR X0, X4 + PXOR X3, X4 + PXOR X8, X4 + PXOR X9, X4 + MOVOU X4, 112(CX) + + // 13=13^5^11^19^27 + MOVOU X1, X4 + PXOR X3, X4 + PXOR X2, X4 + PXOR 304(AX), X4 + PXOR X7, X4 + MOVOU X4, 208(CX) + MOVOU 464(AX), X7 + + // 23=23^15^21^29^5 + MOVOU X1, X4 + PXOR X5, X4 + PXOR X6, X4 + PXOR X9, X4 + PXOR X7, X4 + MOVOU X4, 368(CX) + + // 31=31^23^29^5^13 + PXOR X8, X1 + PXOR X6, X1 + PXOR X3, X1 + PXOR X7, X1 + MOVOU X1, 496(CX) + + // 15=15^7^13^21^29 + PXOR X3, X0 + PXOR X7, X0 + PXOR X9, X0 + PXOR X5, X0 + MOVOU X0, 240(CX) RET -// func xor32x128avx(x *byte, y *byte, out *byte) +// func l256(x *byte, buffer *byte) // Requires: AVX, AVX2 -TEXT ·xor32x128avx(SB), NOSPLIT, $0-24 - MOVQ x+0(FP), AX - MOVQ y+8(FP), CX - MOVQ out+16(FP), DX - XORQ BX, BX +TEXT ·l256(SB), NOSPLIT, $0-16 + MOVQ 
x+0(FP), AX + MOVQ buffer+8(FP), CX + VMOVDQU (AX), Y0 + VMOVDQU 256(AX), Y1 + VMOVDQU 512(AX), Y2 + VMOVDQU 768(AX), Y3 + VMOVDQU 576(AX), Y5 + VMOVDQU 704(AX), Y6 + VMOVDQU 832(AX), Y7 + VMOVDQU 960(AX), Y8 + VMOVDQU 64(AX), Y9 -xor32_loop_avx: - VMOVDQU (AX)(BX*1), Y0 - VMOVDQU (CX)(BX*1), Y1 + // 0=0^24^14^22^30 + VPXOR Y3, Y0, Y4 + VPXOR 448(AX), Y4, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VMOVDQU Y4, (CX) + + // 2=0^2^26^8^16 + VPXOR Y9, Y0, Y4 + VPXOR Y7, Y4, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VMOVDQU Y4, 64(CX) + + // 8=0^8^22^30^6 + VPXOR Y1, Y0, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR 192(AX), Y4, Y4 + VMOVDQU Y4, 256(CX) + + // 18=0^18^10^16^24 + VPXOR Y5, Y0, Y4 + VPXOR 320(AX), Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 576(CX) + + // 26=0^26^18^24^8 + VPXOR Y1, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y5, Y0, Y0 + VPXOR Y3, Y0, Y0 + VMOVDQU Y0, 832(CX) + + // 10=10^2^8^16^24 + VPXOR 320(AX), Y9, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 320(CX) + VMOVDQU 192(AX), Y0 + VMOVDQU 448(AX), Y5 + + // 16=16^8^30^6^14 + VPXOR Y2, Y1, Y1 + VPXOR Y8, Y1, Y1 VPXOR Y0, Y1, Y1 - VMOVDQU Y1, (DX)(BX*1) - ADDQ $0x20, BX - CMPQ BX, $0x00000200 - JL xor32_loop_avx - VZEROUPPER - RET + VPXOR Y5, Y1, Y1 + VMOVDQU Y1, 512(CX) -// func xorRoundKey128(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) -// Requires: SSE2 -TEXT ·xorRoundKey128(SB), NOSPLIT, $0-40 - MOVL rk+0(FP), AX - MOVQ x1+8(FP), CX - MOVQ x2+16(FP), DX - MOVQ x3+24(FP), BX - MOVQ out+32(FP), SI - PCMPEQB X1, X1 - XORQ R8, R8 + // 24=24^16^6^14^22 + VPXOR Y3, Y2, Y2 + VPXOR Y0, Y2, Y2 + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + VMOVDQU Y2, 768(CX) + VMOVDQU 128(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 384(AX), Y3 - // Handle first byte - MOVL $0x01000000, DI + // 4=4^28^2^10^18 + VPXOR Y1, Y9, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR 896(AX), Y4, Y4 + VMOVDQU Y4, 128(CX) -rk_loop_1: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_1_c - PXOR X1, X0 + // 20=20^12^18^26^2 + VPXOR 640(AX), Y9, Y4 + VPXOR Y3, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 640(CX) + + // 28=28^20^26^2^10 + VPXOR 896(AX), Y9, Y9 + VPXOR 640(AX), Y9, Y9 + VPXOR Y7, Y9, Y9 + VPXOR Y2, Y9, Y9 + VMOVDQU Y9, 896(CX) + VMOVDQU 640(AX), Y9 + + // 6=6^30^4^12^20 + VPXOR Y0, Y1, Y4 + VPXOR Y3, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + VMOVDQU Y4, 192(CX) + + // 12=12^4^10^18^26 + VPXOR Y3, Y1, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 576(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 384(CX) + VMOVDQU 896(AX), Y7 + + // 22=22^14^20^28^4 + VPXOR Y5, Y1, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y9, Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 704(CX) + + // 30=30^22^28^4^12 + VPXOR Y8, Y1, Y1 + VPXOR Y6, Y1, Y1 + VPXOR Y3, Y1, Y1 + VPXOR Y7, Y1, Y1 + VMOVDQU Y1, 960(CX) + + // 14=14^6^12^20^28 + VPXOR Y3, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y0, Y0 + VPXOR Y5, Y0, Y0 + VMOVDQU Y0, 448(CX) + VMOVDQU 32(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 544(AX), Y2 + VMOVDQU 800(AX), Y3 + VMOVDQU 608(AX), Y5 + VMOVDQU 736(AX), Y6 + VMOVDQU 864(AX), Y7 + VMOVDQU 992(AX), Y8 + VMOVDQU 96(AX), Y9 + + // 1=1^25^15^23^31 + VPXOR Y3, Y0, Y4 + VPXOR 480(AX), Y4, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VMOVDQU Y4, 32(CX) + + // 3=3^27^1^9^17 + VPXOR Y9, Y0, Y4 + VPXOR Y7, Y4, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VMOVDQU Y4, 96(CX) + + // 9=9^1^23^31^7 + VPXOR Y1, Y0, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR 224(AX), Y4, Y4 + VMOVDQU Y4, 288(CX) + + // 
19=1^19^11^17^25 + VPXOR Y5, Y0, Y4 + VPXOR 352(AX), Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 608(CX) + + // 27=1^27^19^25^9 + VPXOR Y1, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y5, Y0, Y0 + VPXOR Y3, Y0, Y0 + VMOVDQU Y0, 864(CX) -rk_loop_1_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000080 - JL rk_loop_1 + // 11=11^3^9^17^25 + VPXOR 352(AX), Y9, Y4 + VPXOR Y1, Y4, Y4 + VPXOR Y2, Y4, Y4 + VPXOR Y3, Y4, Y4 + VMOVDQU Y4, 352(CX) + VMOVDQU 224(AX), Y0 + VMOVDQU 480(AX), Y5 - // Handle second byte - MOVL $0x00010000, DI + // 17=17^9^31^7^15 + VPXOR Y2, Y1, Y1 + VPXOR Y8, Y1, Y1 + VPXOR Y0, Y1, Y1 + VPXOR Y5, Y1, Y1 + VMOVDQU Y1, 544(CX) -rk_loop_2: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_2_c - PXOR X1, X0 + // 25=25^17^7^15^23 + VPXOR Y3, Y2, Y2 + VPXOR Y0, Y2, Y2 + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + VMOVDQU Y2, 800(CX) + VMOVDQU 160(AX), Y1 + VMOVDQU 352(AX), Y2 + VMOVDQU 416(AX), Y3 -rk_loop_2_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000100 - JL rk_loop_2 + // 5=5^29^3^11^19 + VPXOR Y1, Y9, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR 928(AX), Y4, Y4 + VMOVDQU Y4, 160(CX) - // Handle third byte - MOVL $0x00000100, DI + // 21=21^13^19^27^3 + VPXOR 672(AX), Y9, Y4 + VPXOR Y3, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 672(CX) -rk_loop_3: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_3_c - PXOR X1, X0 + // 29=29^21^27^3^11 + VPXOR 928(AX), Y9, Y9 + VPXOR 672(AX), Y9, Y9 + VPXOR Y7, Y9, Y9 + VPXOR Y2, Y9, Y9 + VMOVDQU Y9, 928(CX) + VMOVDQU 672(AX), Y9 -rk_loop_3_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000180 - JL rk_loop_3 + // 7=7^31^5^13^21 + VPXOR Y0, Y1, Y4 + VPXOR Y3, Y4, Y4 + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + VMOVDQU Y4, 224(CX) - // Handle last byte - MOVL $0x00000001, DI + // 13=13^5^11^19^27 + VPXOR Y3, Y1, Y4 + VPXOR Y2, Y4, Y4 + VPXOR 608(AX), Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 416(CX) + VMOVDQU 928(AX), Y7 -rk_loop_4: - MOVOU (CX)(R8*1), X0 - PXOR (DX)(R8*1), X0 - PXOR (BX)(R8*1), X0 - TESTL AX, DI - JZ rk_loop_4_c - PXOR X1, X0 + // 23=23^15^21^29^5 + VPXOR Y5, Y1, Y4 + VPXOR Y6, Y4, Y4 + VPXOR Y9, Y4, Y4 + VPXOR Y7, Y4, Y4 + VMOVDQU Y4, 736(CX) -rk_loop_4_c: - MOVOU X0, (SI)(R8*1) - ROLL $0x01, DI - ADDQ $0x10, R8 - CMPQ R8, $0x00000200 - JL rk_loop_4 + // 31=31^23^29^5^13 + VPXOR Y8, Y1, Y1 + VPXOR Y6, Y1, Y1 + VPXOR Y3, Y1, Y1 + VPXOR Y7, Y1, Y1 + VMOVDQU Y1, 992(CX) + + // 15=15^7^13^21^29 + VPXOR Y3, Y0, Y0 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y0, Y0 + VPXOR Y5, Y0, Y0 + VMOVDQU Y0, 480(CX) + VZEROUPPER RET -// func sbox128(x *byte, buffer *byte) -// Requires: SSE2 -TEXT ·sbox128(SB), NOSPLIT, $0-16 +// func sbox256avx2(x *byte, buffer *byte) +// Requires: AVX, AVX2 +TEXT ·sbox256avx2(SB), NOSPLIT, $0-16 MOVQ x+0(FP), AX MOVQ buffer+8(FP), CX // f, for not operation - PCMPEQB X0, X0 + VPCMPEQB Y0, Y0, Y0 // Start input function // t1=b7 ^ b5 - MOVOU 112(AX), X1 - PXOR 80(AX), X1 - MOVOU 16(AX), X2 - MOVOU X2, X3 - MOVOU X2, X4 + VMOVDQU 224(AX), Y1 + VPXOR 160(AX), Y1, Y1 + VMOVDQU 32(AX), Y4 // store m6=b1 - MOVOU X2, 224(CX) + VMOVDQU Y4, 448(CX) // t2=b5 ^ b1 - PXOR 80(AX), X2 - PANDN X0, X2 - - // store g5=^b0 - MOVOU (AX), X5 - MOVOU X5, X6 - PANDN X0, X6 - MOVOU X6, 80(CX) + VPXOR 160(AX), Y4, Y2 + VPANDN Y0, Y2, Y2 + VMOVDQU (AX), Y6 // t3=^(b0 ^ t2) - PXOR X2, X5 - PANDN X0, X5 + VPXOR Y2, Y6, Y5 + VPANDN Y0, Y5, Y5 + + 
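In l128 and l256 above, every output bit plane is the XOR of exactly five input planes — the shape of SM4's linear transform L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24) in bitsliced form, with the five plane indices per output taken from the comments (e.g. "0=0^24^14^22^30") under this implementation's bit ordering. A plain-Go sketch of one output plane; xorFivePlanes, state and buf are illustrative names only:

// x holds 32 input bit planes of 16 bytes each (the l128 layout);
// out receives one output plane, the XOR of the five planes named in idx.
func xorFivePlanes(x *[32][16]byte, out *[16]byte, idx [5]int) {
	for i := range out {
		out[i] = x[idx[0]][i] ^ x[idx[1]][i] ^ x[idx[2]][i] ^ x[idx[3]][i] ^ x[idx[4]][i]
	}
}

// e.g. output plane 0 of l128: xorFivePlanes(&state, &buf[0], [5]int{0, 24, 14, 22, 30})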
// store g5=^b0 + VPANDN Y0, Y6, Y6 + VMOVDQU Y6, 160(CX) // t4=b6 ^ b2 - MOVOU 96(AX), X6 - MOVOU X6, X7 - PXOR 32(AX), X6 + VMOVDQU 192(AX), Y7 + VPXOR 64(AX), Y7, Y6 // t5=b3 ^ t3 - MOVOU 48(AX), X8 - MOVOU X8, X9 - PXOR X5, X8 + VMOVDQU 96(AX), Y9 + VPXOR Y5, Y9, Y8 // t6=b4 ^ t1 - MOVOU 64(AX), X10 - PXOR X1, X10 + VPXOR 128(AX), Y1, Y10 // t7=b1 ^ t5 - PXOR X8, X3 + VPXOR Y8, Y4, Y3 // t8=b1 ^ t4 - PXOR X6, X4 + VPXOR Y6, Y4, Y4 // t9=t6 ^ t8 - MOVOU X10, X11 - PXOR X4, X11 + VPXOR Y4, Y10, Y11 // store m8 - MOVOU X11, 256(CX) + VMOVDQU Y11, 512(CX) // store g1 - MOVOU X3, 16(CX) + VMOVDQU Y3, 32(CX) // store g3 - MOVOU X8, 48(CX) + VMOVDQU Y8, 96(CX) // store g4 - MOVOU X2, 64(CX) + VMOVDQU Y2, 128(CX) // store m0 - MOVOU X10, 128(CX) + VMOVDQU Y10, 256(CX) // store m1 - MOVOU X5, 144(CX) + VMOVDQU Y5, 288(CX) // store m2 - MOVOU X4, 160(CX) + VMOVDQU Y4, 320(CX) // store m4 - MOVOU X6, 192(CX) + VMOVDQU Y6, 384(CX) // t11=^(b3 ^ t1) - PXOR X1, X9 - PANDN X0, X9 + VPXOR Y1, Y9, Y9 + VPANDN Y0, Y9, Y9 // store m5, can reuse t1 now - MOVOU X9, 208(CX) + VMOVDQU Y9, 416(CX) // t12=^(b6 ^ t9) - PXOR X11, X7 - PANDN X0, X7 + VPXOR Y11, Y7, Y7 + VPANDN Y0, Y7, Y7 // store m9, can reuse t7 t8 t9 now - MOVOU X7, 272(CX) + VMOVDQU Y7, 544(CX) // t10=t6 ^ t7 - PXOR X10, X3 + VPXOR Y10, Y3, Y3 // store g0, can reuse t6 now - MOVOU X3, (CX) + VMOVDQU Y3, (CX) // t13=t4 ^ t10 - PXOR X6, X3 + VPXOR Y6, Y3, Y3 // store g2, can reuse t4 now - MOVOU X3, 32(CX) + VMOVDQU Y3, 64(CX) // t14=t2 ^ t11 - MOVOU X9, X1 - PXOR X2, X1 + VPXOR Y2, Y9, Y1 // store g6, can reuse t2 now - MOVOU X1, 96(CX) + VMOVDQU Y1, 192(CX) // t15=t12^t14 - PXOR X7, X1 + VPXOR Y7, Y1, Y1 // store g7 - MOVOU X1, 112(CX) + VMOVDQU Y1, 224(CX) // t16=t3 ^ t12 - PXOR X5, X7 + VPXOR Y5, Y7, Y7 // store m3 - MOVOU X7, 176(CX) + VMOVDQU Y7, 352(CX) // t17=t11 ^ t16 - PXOR X9, X7 + VPXOR Y9, Y7, Y7 // store m7 - MOVOU X7, 240(CX) + VMOVDQU Y7, 480(CX) // Start top function // Current register status: t17=t16=t12=m7, t11=m5, t15=t14=t1=g7, t13=t10=t7=g2, t4=m4, t8=m2, t3=m1, t6=m0, t2=g4, t5=g3,t9=m8 - // t2=^(m0 & m1) - PAND X10, X5 - PANDN X0, X5 + // t2= (m0 & m1) + VPAND Y10, Y5, Y5 - // t3=^(g0 & g4) - PAND (CX), X2 - PANDN X0, X2 + // t3= (g0 & g4) + VPAND (CX), Y2, Y2 - // t4=^(g3 & g7) - MOVOU X1, X10 - PAND X8, X1 - PANDN X0, X1 + // t4= (g3 & g7) + VPAND Y8, Y1, Y10 - // t7=^(g3 | g7) - POR X10, X8 - PANDN X0, X8 + // t7= (g3 | g7) + VPOR Y1, Y8, Y8 - // t11=^(m4 & m5) - PAND X6, X9 - PANDN X0, X9 - MOVOU 176(CX), X6 - MOVOU X6, X10 + // t11= (m4 & m5) + VPAND Y6, Y9, Y9 + VMOVDQU 352(CX), Y6 - // t10=^( m3 & m2 ) - PAND X4, X10 - PANDN X0, X10 + // t10= ( m3 & m2 ) + VPAND Y4, Y6, Y1 - // t12=^( m3 | m2 ) - POR X4, X6 - PANDN X0, X6 + // t12= ( m3 | m2 ) + VPOR Y4, Y6, Y6 - // t6=^( g6 | g2 ) - POR 96(CX), X3 - PANDN X0, X3 + // t6= ( g6 | g2 ) + VPOR 192(CX), Y3, Y3 - // t9=^( m6 | m7 ) - POR 224(CX), X7 - PANDN X0, X7 - MOVOU 272(CX), X4 - MOVOU X4, X12 + // t9= ( m6 | m7 ) + VPOR 448(CX), Y7, Y7 + VMOVDQU 544(CX), Y12 - // t5=^( m8 & m9 ) - PAND X11, X4 - PANDN X0, X4 + // t5= ( m8 & m9 ) + VPAND Y11, Y12, Y4 - // t8=^( m8 | m9 ) - POR X11, X12 - PANDN X0, X12 + // t8= ( m8 | m9 ) + VPOR Y11, Y12, Y12 // t14 = t3 ^ t2 - PXOR X5, X2 + VPXOR Y5, Y2, Y2 // t16 = t5 ^ t14 - PXOR X2, X4 + VPXOR Y2, Y4, Y4 // t20 = t16 ^ t7 - PXOR X4, X8 + VPXOR Y4, Y8, Y8 // t17 = t9 ^ t10 - PXOR X7, X10 + VPXOR Y7, Y1, Y1 // t18 = t11 ^ t12 - PXOR X9, X6 + VPXOR Y9, Y6, Y6 // p2 = t20 ^ t18 - PXOR X8, X6 + VPXOR Y8, Y6, Y6 // p0 = t6 ^ t16 - 
PXOR X3, X4 + VPXOR Y3, Y4, Y4 - // t1 = ^(g5 & g1) - MOVOU 16(CX), X2 - MOVOU 80(CX), X8 - PAND X2, X8 - PANDN X0, X8 + // t1 = (g5 & g1) + VMOVDQU 32(CX), Y2 + VMOVDQU 160(CX), Y8 + VPAND Y2, Y8, Y8 // t13 = t1 ^ t2 - PXOR X8, X5 + VPXOR Y8, Y5, Y5 // t15 = t13 ^ t4 - PXOR X1, X5 + VPXOR Y10, Y5, Y5 // t19 = t6 ^ t15 - PXOR X5, X3 + VPXOR Y5, Y3, Y3 // p3 = t19 ^ t17 - PXOR X10, X3 + VPXOR Y1, Y3, Y3 // p1 = t8 ^ t15 - PXOR X12, X5 + VPXOR Y12, Y5, Y5 // start middle function // current register status: t8=p0, t3=p1, t4=p2, t7=p0 - // t1 = ^(p3 & p0) - MOVOU X4, X1 - PAND X3, X1 - PANDN X0, X1 + // t0 = (p1 & p2) + VPAND Y5, Y6, Y1 - // t2 = ^(t1 | p2) - MOVOU X6, X2 - POR X1, X2 - PANDN X0, X2 + // t1 = (p3 & p0) + VPAND Y3, Y4, Y2 - // t3 = ^(p2 & p0) - MOVOU X6, X8 - PAND X4, X6 - PANDN X0, X6 + // t2 = (p0 & p2) + VPAND Y6, Y4, Y8 - // t4 = p1 ^ t3 - PXOR X5, X6 + // t3 = (p1 & p3) + VPAND Y5, Y3, Y10 - // t5 = ^(p2 | t4) - MOVOU X8, X11 - POR X6, X8 - PANDN X0, X8 + // t4 = (t0 & t2) + VPAND Y1, Y8, Y11 - // t6 = ^(p1 & t4) - MOVOU X5, X10 - PAND X6, X10 - PANDN X0, X10 + // t5 = (t1 ^ t3) + VPXOR Y2, Y10, Y12 - // t7 = ^(p3 | t4) - MOVOU X3, X9 - POR X6, X3 - PANDN X0, X3 + // t6 = (t5 | p0) + VPOR Y12, Y4, Y4 - // t8 = ^(t7 | t2) - MOVOU X4, X7 - MOVOU X3, X4 - POR X2, X4 - PANDN X0, X4 + // t7 = (t2 | p3) + VPOR Y8, Y3, Y3 - // t9 = ^(t7 ^ t5) - PXOR X8, X3 - PANDN X0, X3 + // t8 = (t4 ^ t6) + VPXOR Y11, Y4, Y4 - // t10 = ^(t9 ^ p3) - PXOR X3, X9 - PANDN X0, X9 + // t9 = (t7 ^ t3) + VPXOR Y10, Y3, Y11 - // t11 = ^(t6 & t8) - PAND X4, X10 - PANDN X0, X10 + // t10 = (t0 ^ t9) + VPXOR Y1, Y11, Y1 - // t12 = ^(p1 & t8) - PAND X4, X5 - PANDN X0, X5 + // t11 = (t5 | p2) + VPOR Y6, Y12, Y6 - // t13 = ^(t12 ^ p0) - PXOR X5, X7 - PANDN X0, X7 + // l1 = t11 ^ t1 + VPXOR Y2, Y6, Y2 - // t14 = ^(t1 & p2) - PAND X1, X11 - PANDN X0, X11 + // t12 = (t2 | p1) + VPOR Y8, Y5, Y5 - // t15 = ^(t14 & t9) - PAND X11, X3 - PANDN X0, X3 + // l2 = t12 ^ t5 + VPXOR Y5, Y12, Y5 // start bottom function - // current register status: t11=l0, t7=l1, t6=l2, t12=l3 + // current register status: t1=l0, t2=l1, t3=l2, t8=l3 // k4 = l2 ^ l3 - MOVOU X7, X8 - PXOR X10, X8 + VPXOR Y5, Y4, Y3 // k3 = l1 ^ l3 - MOVOU X7, X6 - PXOR X3, X6 + VPXOR Y4, Y2, Y10 // k2 = l0 ^ l2 - MOVOU X10, X5 - PXOR X9, X5 + VPXOR Y1, Y5, Y8 // k0 = l0 ^ l1 - MOVOU X3, X1 - PXOR X9, X1 + VPXOR Y1, Y2, Y6 // k1 = k2 ^ k3 - MOVOU X6, X2 - PXOR X5, X2 + VPXOR Y8, Y10, Y11 - // e0=^(m1 & k0) - MOVOU 144(CX), X4 - PAND X1, X4 - PANDN X0, X4 + // e0= (m1 & k0) + VMOVDQU 288(CX), Y12 + VPAND Y6, Y12, Y12 - // e1=^(g5 & l1) - MOVOU 80(CX), X11 - PAND X3, X11 - PANDN X0, X11 + // e1= (g5 & l1) + VMOVDQU 160(CX), Y9 + VPAND Y2, Y9, Y9 // r0=e0 ^ e1 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e2=^(g4 & l0) - MOVOU 64(CX), X12 - PAND X9, X12 - PANDN X0, X12 + // e2=(g4 & l0) + VMOVDQU 128(CX), Y7 + VPAND Y1, Y7, Y7 // r1=e2 ^ e1 - PXOR X12, X11 + VPXOR Y7, Y9, Y9 // store r0 r1 - MOVOU X4, 352(CX) - MOVOU X11, 368(CX) + VMOVDQU Y12, 704(CX) + VMOVDQU Y9, 736(CX) - // e3=^(m7 & k3) - MOVOU 240(CX), X4 - PAND X6, X4 - PANDN X0, X4 + // e3= (m7 & k3) + VMOVDQU 480(CX), Y12 + VPAND Y10, Y12, Y12 - // e4=^(m5 & k2) - MOVOU 208(CX), X11 - PAND X5, X11 - PANDN X0, X11 + // e4= (m5 & k2) + VMOVDQU 416(CX), Y9 + VPAND Y8, Y9, Y9 // r2=e3 ^ e4 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e5=^(m3 & k1) - MOVOU 176(CX), X12 - PAND X2, X12 - PANDN X0, X12 + // e5= (m3 & k1) + VMOVDQU 352(CX), Y7 + VPAND Y11, Y7, Y7 // r3=e5 ^ e4 - PXOR X12, X11 + VPXOR Y7, Y9, Y9 // store 
r2 r3 - MOVOU X4, 384(CX) - MOVOU X11, 400(CX) + VMOVDQU Y12, 768(CX) + VMOVDQU Y9, 800(CX) - // e6=^(m9 & k4) - MOVOU 272(CX), X4 - PAND X8, X4 - PANDN X0, X4 + // e6=(m9 & k4) + VMOVDQU 544(CX), Y12 + VPAND Y3, Y12, Y12 - // e7=^(g7 & l3) - MOVOU 112(CX), X11 - PAND X7, X11 - PANDN X0, X11 + // e7=(g7 & l3) + VMOVDQU 224(CX), Y9 + VPAND Y4, Y9, Y9 // r4=e7 ^ e6 - PXOR X11, X4 + VPXOR Y9, Y12, Y12 - // e8=^(g6 & l2) - MOVOU 96(CX), X12 - PAND X10, X12 - PANDN X0, X12 + // e8=(g6 & l2) + VMOVDQU 192(CX), Y7 + VPAND Y5, Y7, Y7 // r5=e8 ^ e6 - PXOR X12, X11 + VPXOR Y7, Y9, Y7 - // store r4 r5 - MOVOU X4, 416(CX) - MOVOU X11, 432(CX) + // store r4 + VMOVDQU Y12, 832(CX) - // e9=^(m0 & k0) - MOVOU 128(CX), X4 - PAND X1, X4 - PANDN X0, X4 + // e9=(m0 & k0) + VMOVDQU 256(CX), Y12 + VPAND Y6, Y12, Y12 - // e10=^(g1 & l1) - MOVOU 16(CX), X1 - PAND X3, X1 - PANDN X0, X1 + // e10=(g1 & l1) + VMOVDQU 32(CX), Y6 + VPAND Y6, Y2, Y2 // r6=e9 ^ e10 - PXOR X1, X4 + VPXOR Y12, Y2, Y12 - // e11=^(g0 & l0) - MOVOU (CX), X12 - PAND X9, X12 - PANDN X0, X12 + // e11=(g0 & l0) + VMOVDQU (CX), Y9 + VPAND Y1, Y9, Y9 // r7=e11 ^ e10 - PXOR X12, X1 + VPXOR Y9, Y2, Y1 - // store r6 - MOVOU X4, 448(CX) + // e12=(m6 & k3) + VMOVDQU 448(CX), Y2 + VPAND Y2, Y10, Y2 - // e12=^(m6 & k3) - MOVOU 224(CX), X3 - PAND X6, X3 - PANDN X0, X3 - - // e13=^(m4 & k2) - MOVOU 192(CX), X9 - PAND X5, X9 - PANDN X0, X9 + // e13=(m4 & k2) + VMOVDQU 384(CX), Y10 + VPAND Y10, Y8, Y8 // r8=e12 ^ e13 - PXOR X9, X3 + VPXOR Y2, Y8, Y2 - // e14=^(m2 & k1) - MOVOU 160(CX), X12 - PAND X2, X12 - PANDN X0, X12 + // e14=(m2 & k1) + VMOVDQU 320(CX), Y10 + VPAND Y10, Y11, Y10 // r9=e14 ^ e13 - PXOR X12, X9 + VPXOR Y8, Y10, Y6 - // e15=^(m8 & k4) - MOVOU 256(CX), X4 - PAND X8, X4 - PANDN X0, X4 + // e15=(m8 & k4) + VMOVDQU 512(CX), Y11 + VPAND Y11, Y3, Y11 - // e16=^(g3 & l3) - MOVOU 48(CX), X11 - PAND X7, X11 - PANDN X0, X11 + // e16=(g3 & l3) + VMOVDQU 96(CX), Y3 + VPAND Y3, Y4, Y4 // r10=e15 ^ e16 - PXOR X11, X4 + VPXOR Y11, Y4, Y8 - // e17=^(g2 & l2) - MOVOU 32(CX), X12 - PAND X10, X12 - PANDN X0, X12 + // e17=(g2 & l2) + VMOVDQU 64(CX), Y11 + VPAND Y5, Y11, Y5 // r11=e17 ^ e16 - PXOR X12, X11 + VPXOR Y5, Y4, Y5 // start output function // [t1]=r7 ^ r9 - PXOR X1, X9 - - // t2=t1 ^ r1 - MOVOU 368(CX), X2 - PXOR X9, X2 - - // t3=t2 ^ r3 - MOVOU 400(CX), X5 - MOVOU X5, X6 - PXOR X2, X5 - - // t4=r5 ^ r3 - PXOR 432(CX), X6 - MOVOU 416(CX), X8 - MOVOU X8, X10 - - // t5=r4 ^ t4 - PXOR X6, X8 - - // t6=r0 ^ t4 - PXOR 352(CX), X10 - - // [t7]=r11 ^ r7 - PXOR X11, X1 - - // [t8]=[t1] ^ t4 - PXOR X9, X6 - - // store t8 - MOVOU X6, 80(AX) - - // [t9]=[t1] ^ t6 - PXOR X10, X9 - - // store t9 - MOVOU X9, 32(AX) - - // [t10]=r2 ^ t5 - PXOR 384(CX), X8 - - // [t11]=r10 ^ r8 - PXOR X4, X3 - - // store t11 - MOVOU X3, 48(AX) - - // [t12]=^(t3 ^ [t11]) - PXOR X5, X3 - PANDN X0, X3 - - // store t12 - MOVOU X3, 16(AX) - - // [t13]=[t10] ^ [t12] - PXOR X3, X8 - - // store t13 - MOVOU X8, 96(AX) - - // [t14]=^(t3 ^ [t7]) - PXOR X5, X1 - PANDN X0, X1 - - // store t14 - MOVOU X1, 64(AX) - - // [t16]=t6 ^ [t14] - PXOR X10, X1 - - // store t16 - MOVOU X1, (AX) - - // [t15]=^(r10 ^ r6) - PXOR 448(CX), X4 - PANDN X0, X4 - - // store t15 - MOVOU X4, 112(AX) - RET + VPXOR Y1, Y6, Y6 -// func l128(x *byte, buffer *byte) -// Requires: SSE2 -TEXT ·l128(SB), NOSPLIT, $0-16 - MOVQ x+0(FP), AX - MOVQ buffer+8(FP), CX - MOVOU (AX), X0 - MOVOU 128(AX), X1 - MOVOU 256(AX), X2 - MOVOU 384(AX), X3 - MOVOU 288(AX), X5 - MOVOU 352(AX), X6 - MOVOU 416(AX), X7 - MOVOU 480(AX), X8 - MOVOU 
32(AX), X9 + // [t2]=t1 ^ r1 + VMOVDQU 736(CX), Y10 + VPXOR Y6, Y10, Y10 - // 0=0^24^14^22^30 - MOVOU X0, X4 - PXOR X3, X4 - PXOR 224(AX), X4 - PXOR X6, X4 - PXOR X8, X4 - MOVOU X4, (CX) + // [t3]=t2 ^ r3 + VMOVDQU 800(CX), Y4 + VPXOR Y10, Y4, Y3 - // 2=0^2^26^8^16 - MOVOU X0, X4 - PXOR X9, X4 - PXOR X7, X4 - PXOR X1, X4 - PXOR X2, X4 - MOVOU X4, 32(CX) + // [t4]=r5 ^ r3 + VPXOR Y7, Y4, Y4 - // 8=0^8^22^30^6 - MOVOU X0, X4 - PXOR X1, X4 - PXOR X6, X4 - PXOR X8, X4 - PXOR 96(AX), X4 - MOVOU X4, 128(CX) + // [t5]=r4 ^ [t4] + VMOVDQU 832(CX), Y9 + VPXOR Y4, Y9, Y11 - // 18=0^18^10^16^24 - MOVOU X0, X4 - PXOR X5, X4 - PXOR 160(AX), X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 288(CX) + // [t6]=r0 ^ r4 + VPXOR 704(CX), Y9, Y9 - // 26=0^26^18^24^8 - PXOR X1, X0 - PXOR X7, X0 - PXOR X5, X0 - PXOR X3, X0 - MOVOU X0, 416(CX) + // [t7]=r11 ^ r7 + VPXOR Y5, Y1, Y1 - // 10=10^2^8^16^24 - MOVOU X9, X4 - PXOR 160(AX), X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 160(CX) - MOVOU 96(AX), X0 - MOVOU 224(AX), X5 + // [t8]=[t1] ^ [t4] + VPXOR Y6, Y4, Y4 - // 16=16^8^30^6^14 - PXOR X2, X1 - PXOR X8, X1 - PXOR X0, X1 - PXOR X5, X1 - MOVOU X1, 256(CX) + // store t8 + VMOVDQU Y4, 160(AX) - // 24=24^16^6^14^22 - PXOR X3, X2 - PXOR X0, X2 - PXOR X5, X2 - PXOR X6, X2 - MOVOU X2, 384(CX) - MOVOU 64(AX), X1 - MOVOU 160(AX), X2 - MOVOU 192(AX), X3 + // [t9]=[t1] ^ [t6] + VPXOR Y9, Y6, Y6 - // 4=4^28^2^10^18 - MOVOU X9, X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR 288(AX), X4 - PXOR 448(AX), X4 - MOVOU X4, 64(CX) + // store t9 + VMOVDQU Y6, 64(AX) - // 20=20^12^18^26^2 - MOVOU X9, X4 - PXOR 320(AX), X4 - PXOR X3, X4 - PXOR 288(AX), X4 - PXOR X7, X4 - MOVOU X4, 320(CX) + // [t10]=r2 ^ t5 + VPXOR 768(CX), Y11, Y11 - // 28=28^20^26^2^10 - PXOR 448(AX), X9 - PXOR 320(AX), X9 - PXOR X7, X9 - PXOR X2, X9 - MOVOU X9, 448(CX) - MOVOU 320(AX), X9 + // [t11]=r10 ^ r8 + VPXOR Y8, Y2, Y2 - // 6=6^30^4^12^20 - MOVOU X1, X4 - PXOR X0, X4 - PXOR X3, X4 - PXOR X8, X4 - PXOR X9, X4 - MOVOU X4, 96(CX) + // store t11 + VMOVDQU Y2, 96(AX) - // 12=12^4^10^18^26 - MOVOU X1, X4 - PXOR X3, X4 - PXOR X2, X4 - PXOR 288(AX), X4 - PXOR X7, X4 - MOVOU X4, 192(CX) - MOVOU 448(AX), X7 + // [t12]=^([t3] ^ [t11]) + VPXOR Y3, Y2, Y2 + VPANDN Y0, Y2, Y2 - // 22=22^14^20^28^4 - MOVOU X1, X4 - PXOR X5, X4 - PXOR X6, X4 - PXOR X9, X4 - PXOR X7, X4 - MOVOU X4, 352(CX) + // store t12 + VMOVDQU Y2, 32(AX) - // 30=30^22^28^4^12 - PXOR X8, X1 - PXOR X6, X1 - PXOR X3, X1 - PXOR X7, X1 - MOVOU X1, 480(CX) + // [t13]=[t10] ^ [t12] + VPXOR Y2, Y11, Y11 - // 14=14^6^12^20^28 - PXOR X3, X0 - PXOR X7, X0 - PXOR X9, X0 - PXOR X5, X0 - MOVOU X0, 224(CX) - MOVOU 16(AX), X0 - MOVOU 144(AX), X1 - MOVOU 272(AX), X2 - MOVOU 400(AX), X3 - MOVOU 304(AX), X5 - MOVOU 368(AX), X6 - MOVOU 432(AX), X7 - MOVOU 496(AX), X8 - MOVOU 48(AX), X9 + // store t13 + VMOVDQU Y11, 192(AX) - // 1=1^25^15^23^31 - MOVOU X0, X4 - PXOR X3, X4 - PXOR 240(AX), X4 - PXOR X6, X4 - PXOR X8, X4 - MOVOU X4, 16(CX) + // [t14]=^([t3] ^ [t7]) + VPXOR Y3, Y1, Y1 + VPANDN Y0, Y1, Y1 - // 3=3^27^1^9^17 - MOVOU X0, X4 - PXOR X9, X4 - PXOR X7, X4 - PXOR X1, X4 - PXOR X2, X4 - MOVOU X4, 48(CX) + // store t14 + VMOVDQU Y1, 128(AX) - // 9=9^1^23^31^7 - MOVOU X0, X4 - PXOR X1, X4 - PXOR X6, X4 - PXOR X8, X4 - PXOR 112(AX), X4 - MOVOU X4, 144(CX) + // [t16]=[t6] ^ [t14] + VPXOR Y9, Y1, Y1 - // 19=1^19^11^17^25 - MOVOU X0, X4 - PXOR X5, X4 - PXOR 176(AX), X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 304(CX) + // store t16 + VMOVDQU Y1, (AX) - // 27=1^27^19^25^9 - PXOR X1, X0 - PXOR X7, X0 - PXOR X5, X0 - PXOR X3, X0 - 
MOVOU X0, 432(CX) + // [t15]=^(r10 ^ r6) + VPXOR Y12, Y8, Y8 + VPANDN Y0, Y8, Y8 - // 11=11^3^9^17^25 - MOVOU X9, X4 - PXOR 176(AX), X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR X3, X4 - MOVOU X4, 176(CX) - MOVOU 112(AX), X0 - MOVOU 240(AX), X5 + // store t15 + VMOVDQU Y8, 224(AX) + VZEROUPPER + RET - // 17=17^9^31^7^15 - PXOR X2, X1 - PXOR X8, X1 - PXOR X0, X1 - PXOR X5, X1 - MOVOU X1, 272(CX) +// func xorRoundKey256avx2(rk uint32, x1 *byte, x2 *byte, x3 *byte, out *byte) +// Requires: AVX, AVX2 +TEXT ·xorRoundKey256avx2(SB), NOSPLIT, $0-40 + MOVL rk+0(FP), AX + MOVQ x1+8(FP), CX + MOVQ x2+16(FP), DX + MOVQ x3+24(FP), BX + MOVQ out+32(FP), SI + VPCMPEQB Y1, Y1, Y1 + XORQ R8, R8 - // 25=25^17^7^15^23 - PXOR X3, X2 - PXOR X0, X2 - PXOR X5, X2 - PXOR X6, X2 - MOVOU X2, 400(CX) - MOVOU 80(AX), X1 - MOVOU 176(AX), X2 - MOVOU 208(AX), X3 + // Handle first byte + MOVL $0x01000000, DI - // 5=5^29^3^11^19 - MOVOU X9, X4 - PXOR X1, X4 - PXOR X2, X4 - PXOR 304(AX), X4 - PXOR 464(AX), X4 - MOVOU X4, 80(CX) +rk_loop_1: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_1_c + VPXOR Y1, Y0, Y0 - // 21=21^13^19^27^3 - MOVOU X9, X4 - PXOR 336(AX), X4 - PXOR X3, X4 - PXOR 304(AX), X4 - PXOR X7, X4 - MOVOU X4, 336(CX) +rk_loop_1_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000100 + JL rk_loop_1 - // 29=29^21^27^3^11 - PXOR 464(AX), X9 - PXOR 336(AX), X9 - PXOR X7, X9 - PXOR X2, X9 - MOVOU X9, 464(CX) - MOVOU 336(AX), X9 + // Handle second byte + MOVL $0x00010000, DI - // 7=7^31^5^13^21 - MOVOU X1, X4 - PXOR X0, X4 - PXOR X3, X4 - PXOR X8, X4 - PXOR X9, X4 - MOVOU X4, 112(CX) +rk_loop_2: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_2_c + VPXOR Y1, Y0, Y0 - // 13=13^5^11^19^27 - MOVOU X1, X4 - PXOR X3, X4 - PXOR X2, X4 - PXOR 304(AX), X4 - PXOR X7, X4 - MOVOU X4, 208(CX) - MOVOU 464(AX), X7 +rk_loop_2_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000200 + JL rk_loop_2 - // 23=23^15^21^29^5 - MOVOU X1, X4 - PXOR X5, X4 - PXOR X6, X4 - PXOR X9, X4 - PXOR X7, X4 - MOVOU X4, 368(CX) + // Handle third byte + MOVL $0x00000100, DI - // 31=31^23^29^5^13 - PXOR X8, X1 - PXOR X6, X1 - PXOR X3, X1 - PXOR X7, X1 - MOVOU X1, 496(CX) +rk_loop_3: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_3_c + VPXOR Y1, Y0, Y0 - // 15=15^7^13^21^29 - PXOR X3, X0 - PXOR X7, X0 - PXOR X9, X0 - PXOR X5, X0 - MOVOU X0, 240(CX) +rk_loop_3_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000300 + JL rk_loop_3 + + // Handle last byte + MOVL $0x00000001, DI + +rk_loop_4: + VMOVDQU (CX)(R8*1), Y0 + VPXOR (DX)(R8*1), Y0, Y0 + VPXOR (BX)(R8*1), Y0, Y0 + TESTL AX, DI + JZ rk_loop_4_c + VPXOR Y1, Y0, Y0 + +rk_loop_4_c: + VMOVDQU Y0, (SI)(R8*1) + ROLL $0x01, DI + ADDQ $0x20, R8 + CMPQ R8, $0x00000400 + JL rk_loop_4 RET diff --git a/transpose_amd64_test.go b/transpose_amd64_test.go index 1d1bbe5..6fa55db 100644 --- a/transpose_amd64_test.go +++ b/transpose_amd64_test.go @@ -102,3 +102,50 @@ func BenchmarkBS128TransposeRevAvx(b *testing.B) { transpose128RevAvx(&input[0], &out[0]) } } + +func TestBS256TransposeRev(t *testing.T) { + in := make([]byte, 256*16) + ret := make([]byte, 256*16) + out := make([]byte, 256*16) + + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + for i := 0; i < 256; i++ { + copy(in[i*16:], key) + } + + 
transpose256avx(&in[0], &out[0]) + transpose128x256avx2(&out[0], &ret[0]) + if !bytes.Equal(in, ret) { + t.Fatalf("not expected %x", ret[:16]) + } +} + +func BenchmarkBS256TransposeAvx(b *testing.B) { + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + input := make([]byte, 256*16) + for i := 0; i < 256; i++ { + copy(input[i*16:], key) + } + out := make([]byte, 256*16) + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + transpose256avx(&input[0], &out[0]) + } +} + +func BenchmarkBS256TransposeRevAvx(b *testing.B) { + key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10} + input := make([]byte, 256*16) + for i := 0; i < 256; i++ { + copy(input[i*16:], key) + } + out := make([]byte, 256*16) + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + transpose256RevAvx(&input[0], &out[0]) + } +}
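
For reference, the round trip exercised by TestBS256TransposeRev is a plain 256x128 bit-matrix transpose followed by its 128x256 inverse: bit (r, c) of the input becomes bit (c, r) of the output. The naive Go model below is written for this description and is not taken from the patch; the function name, the MSB-first bit order inside each byte, and the requirement that out be zeroed are assumptions, since the AVX routines fix their own lane order with the PSHUFB flip mask. Only the round-trip property, not the exact byte layout, is implied.

package bsref // hypothetical package, for illustration only

// naiveTransposeBits transposes a rows x cols bit matrix packed MSB-first
// into bytes: bit (r, c) of in becomes bit (c, r) of out.
// out must be zeroed by the caller and hold rows*cols/8 bytes.
func naiveTransposeBits(in, out []byte, rows, cols int) {
	for r := 0; r < rows; r++ {
		for c := 0; c < cols; c++ {
			if (in[(r*cols+c)/8]>>(7-uint(c)%8))&1 != 0 {
				out[(c*rows+r)/8] |= 1 << (7 - uint(r)%8)
			}
		}
	}
}

Calling naiveTransposeBits(in, out, 256, 128) and then naiveTransposeBits(out, ret, 128, 256) on zeroed buffers reproduces in, which is the same property the test asserts for transpose256avx followed by transpose128x256avx2.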
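
The new xorRoundKey256avx2 routine can be read the same way: the 256-block bitsliced state keeps one 32-byte plane per state bit, out is x1 ^ x2 ^ x3, and a set round-key bit complements the matching plane (VPXOR against the all-ones Y1). The four loops start their bit masks at 0x01000000, 0x00010000, 0x00000100 and 0x00000001 and rotate left after every plane, so rk is consumed most-significant byte first and least-significant bit first within each byte. The sketch below is only a reading of that assembly; the helper name and package are illustrative.

package bsref // hypothetical package, for illustration only

// xorRoundKeyRef mirrors the plane/bit walk of xorRoundKey256avx2:
// plane row (0..31) uses key bit 8*(3-row/8) + row%8, matching the
// byte-major, bit-minor order of the four assembly loops.
func xorRoundKeyRef(rk uint32, x1, x2, x3, out []byte) {
	const planeSize = 32 // 256 blocks, one bit per block
	for row := 0; row < 32; row++ {
		bit := uint(8*(3-row/8) + row%8)
		var mask byte
		if rk&(1<<bit) != 0 {
			mask = 0xff // complement the whole plane, as VPXOR with Y1 does
		}
		base := row * planeSize
		for i := 0; i < planeSize; i++ {
			out[base+i] = x1[base+i] ^ x2[base+i] ^ x3[base+i] ^ mask
		}
	}
}

Complementing a whole plane is the bitsliced equivalent of XORing that key bit into the same position of all 256 blocks at once.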