diff --git a/include/fast_io_crypto/hash/sha512_simd32_shuffle.h b/include/fast_io_crypto/hash/sha512_simd32_shuffle.h new file mode 100644 index 00000000..05e0495d --- /dev/null +++ b/include/fast_io_crypto/hash/sha512_simd32_shuffle.h @@ -0,0 +1,408 @@ +#pragma once +/* sha512_simd32_byte_swap_message_4rounds: loads one vector of message data from blocks_start+(round*8) into s1, byte-swaps it when the host is little-endian, and stores it to w+round. The same data plus the constants loaded from K512+round is stored to wt. s1 keeps the (possibly swapped) message data on return. */ +#if __has_cpp_attribute(__gnu__::__always_inline__) +[[__gnu__::__always_inline__]] +#endif +inline void sha512_simd32_byte_swap_message_4rounds(::fast_io::intrinsics::simd_vector& __restrict s1, + std::byte const* __restrict blocks_start,std::uint_least64_t* __restrict w,std::uint_least64_t* __restrict wt,std::size_t round) noexcept +{ + using namespace ::fast_io::intrinsics; + simd_vector s0; + s1.load(blocks_start+(round*8u)); + s0.load(K512+round); + if constexpr(std::endian::native==std::endian::little) + { + s1.swap_endian(); + } + s1.store(w+round); + s0.wrap_add_assign(s1); + s0.store(wt); +} +/* sha512_simd32_compute_message_4rounds: runs four scalar rounds that consume wt[0..3] while computing the next vector of the message schedule for index round. The new schedule vector is stored to w+round, and that vector plus the constants from K512+round is stored to wt1. On entry s1 supplies the operand for the s1 term of the first two lanes; on return s1 holds lanes 2 and 3 of the newly stored vector. a..h and bpc are updated in place. Reads w+(round-16), w+(round-15) and w+(round-7), so round is expected to be at least 16. */ +#if __has_cpp_attribute(__gnu__::__always_inline__) +[[__gnu__::__always_inline__]] +#endif +inline void sha512_simd32_compute_message_4rounds( + ::fast_io::intrinsics::simd_vector& __restrict s1, + std::uint_least64_t* __restrict w,std::uint_least64_t* __restrict wt + ,std::uint_least64_t* __restrict wt1,std::size_t round, +/* Working variables of the round function; every one of them is modified by this call. */ + std::uint_least64_t& a,std::uint_least64_t& b, + std::uint_least64_t& c,std::uint_least64_t& d, + std::uint_least64_t& e,std::uint_least64_t& f, + std::uint_least64_t& g,std::uint_least64_t& h, + std::uint_least64_t& bpc + ) noexcept +{ +/* +s0(w[t])=(w[t-15]>>1) ^ (w[t-15]<<63)^ (w[t-15]>>8) ^ (w[t-15]<<56) ^ (w[t-15]>>7) +s1(w[t])=(w[t-2]>>19) ^ (w[t-2]<<45) ^ (w[t-2]>>61) ^ (w[t-2]<<3) ^ (w[t-2]>>6) +w[i] = w[i-16] + w[i-7] + s0[i] + s1[i] +wt[r,r+1,r+2,r+3]=wt[r-16,r-15,r-14,r-13]+wt[r-7,r-6,r-5,r-4] +*/ +/* NOTE(review): s2low and s1 are assigned two-index shuffle results while s2 is assigned a four-index one, so the simd_vector declarations below presumably had distinct template arguments that are not visible in this text; confirm against the original header. */ + using namespace ::fast_io::intrinsics; + simd_vector s0,s2,s0temp,s0temp2; + simd_vector s2low,s3,s1temp,s1temp2; + std::uint_least64_t temp,temp1,T1k,apb; + if constexpr(false)/* Never-executed branch: the schedule computation first, then the four rounds one after another. */ + { + s0.load(w+(round-15)); + s1temp=s1>>19; + s1temp2=s1<<45; + s1temp^=s1temp2; + 
s1temp2=s1>>61; + s1temp^=s1temp2; + s1temp2=s1<<3; + s1temp^=s1temp2; + s1temp2=s1>>6; + s1=s1temp^s1temp2; + s0temp=s0>>1; + s0temp2=s0<<63; + s0temp^=s0temp2; + s0temp2=s0>>8; + s0temp^=s0temp2; + s0temp2=s0<<56; + s0temp^=s0temp2; + s0temp2=s0>>7; + s0=s0temp^s0temp2; + s2.load(w+(round-16)); + s2.wrap_add_assign(s0); + s0.load(w+(round-7)); + s2.wrap_add_assign(s0); + s2low.value=__builtin_shufflevector(s2.value,s2.value,0,1); + s2low.wrap_add_assign(s1); + s1.value=__builtin_shufflevector(s2.value,s2.value,2,3); + s1temp=s2low>>19; + s1temp2=s2low<<45; + s1temp^=s1temp2; + s1temp2=s2low>>61; + s1temp^=s1temp2; + s1temp2=s2low<<3; + s1temp^=s1temp2; + s1temp2=s2low>>6; + s3=s1temp^s1temp2; + s1.wrap_add_assign(s3); + s2.value=__builtin_shufflevector(s2low.value,s1.value,0,1,2,3); + s2.store(w+round); + s0.load(::fast_io::details::sha512::K512+round); + s0.wrap_add_assign(s2); + s0.store(wt1); +/* Round 1 of 4: consumes wt[0], updates d and h. */ + T1k=h; + T1k+=wt[0]; + temp=std::rotr(e,14); + temp1=std::rotr(e,18); + temp^=temp1; + temp1=std::rotr(e,41); + temp^=temp1; + T1k+=temp; + temp=f^g; + temp&=e; + temp^=g; + T1k+=temp; + temp=std::rotr(a,28); + temp1=std::rotr(a,34); + temp^=temp1; + temp1=std::rotr(a,39); + h=temp^temp1; + apb=a^b; + temp=apb&bpc; + bpc=apb; + temp^=b; + h+=temp; + bpc=apb; + d+=T1k; + h+=T1k; +/* Round 2 of 4: consumes wt[1], updates c and g. */ + T1k=g; + T1k+=wt[1]; + temp=std::rotr(d,14); + temp1=std::rotr(d,18); + temp^=temp1; + temp1=std::rotr(d,41); + temp^=temp1; + T1k+=temp; + temp=e^f; + temp&=d; + temp^=f; + T1k+=temp; + temp=std::rotr(h,28); + temp1=std::rotr(h,34); + temp^=temp1; + temp1=std::rotr(h,39); + g=temp^temp1; + apb=h^a; + temp=apb&bpc; + bpc=apb; + temp^=a; + g+=temp; + bpc=apb; + c+=T1k; + g+=T1k; +/* Round 3 of 4: consumes wt[2], updates b and f. */ + T1k=f; + T1k+=wt[2]; + temp=std::rotr(c,14); + temp1=std::rotr(c,18); + temp^=temp1; + temp1=std::rotr(c,41); + temp^=temp1; + T1k+=temp; + temp=d^e; + temp&=c; + temp^=e; + T1k+=temp; + temp=std::rotr(g,28); + temp1=std::rotr(g,34); + temp^=temp1; + temp1=std::rotr(g,39); + f=temp^temp1; + apb=g^h; + 
temp=apb&bpc; + bpc=apb; + temp^=h; + f+=temp; + bpc=apb; + b+=T1k; + f+=T1k; +/* Round 4 of 4: consumes wt[3], updates a and e. */ + T1k=e; + T1k+=wt[3]; + temp=std::rotr(b,14); + temp1=std::rotr(b,18); + temp^=temp1; + temp1=std::rotr(b,41); + temp^=temp1; + T1k+=temp; + temp=c^d; + temp&=b; + temp^=d; + T1k+=temp; + temp=std::rotr(f,28); + temp1=std::rotr(f,34); + temp^=temp1; + temp1=std::rotr(f,39); + e=temp^temp1; + apb=f^g; + temp=apb&bpc; + bpc=apb; + temp^=g; + e+=temp; + bpc=apb; + a+=T1k; + e+=T1k; + } + else/* Executed branch: appears to be the statements of the branch above with vector and scalar statements interleaved; verify any edit against both orderings. */ + { + s0.load(w+(round-15)); + T1k=h; + T1k+=wt[0]; + s1temp=s1>>19; + temp=std::rotr(e,14); + temp1=std::rotr(e,18); + s1temp2=s1<<45; + temp^=temp1; + temp1=std::rotr(e,41); + s1temp^=s1temp2; + temp^=temp1; + T1k+=temp; + s1temp2=s1>>61; + temp=f^g; + temp&=e; + s1temp^=s1temp2; + temp^=g; + T1k+=temp; + s1temp2=s1<<3; + temp=std::rotr(a,28); + temp1=std::rotr(a,34); + s1temp^=s1temp2; + temp^=temp1; + temp1=std::rotr(a,39); + s1temp2=s1>>6; + h=temp^temp1; + apb=a^b; + s1=s1temp^s1temp2; + temp=apb&bpc; + bpc=apb; + s0temp=s0>>1; + temp^=b; + h+=temp; + s0temp2=s0<<63; + bpc=apb; + d+=T1k; + s0temp^=s0temp2; + h+=T1k; + T1k=g; + s0temp2=s0>>8; + T1k+=wt[1]; + temp=std::rotr(d,14); + s0temp^=s0temp2; + temp1=std::rotr(d,18); + temp^=temp1; + s0temp2=s0<<56; + temp1=std::rotr(d,41); + temp^=temp1; + s0temp^=s0temp2; + T1k+=temp; + temp=e^f; + s0temp2=s0>>7; + temp&=d; + temp^=f; + s0=s0temp^s0temp2; + T1k+=temp; + temp=std::rotr(h,28); + s2.load(w+(round-16)); + temp1=std::rotr(h,34); + temp^=temp1; + s2.wrap_add_assign(s0); + temp1=std::rotr(h,39); + g=temp^temp1; + s0.load(w+(round-7)); + apb=h^a; + temp=apb&bpc; + s2.wrap_add_assign(s0); + bpc=apb; + temp^=a; + s2low.value=__builtin_shufflevector(s2.value,s2.value,0,1); + g+=temp; + bpc=apb; + s2low.wrap_add_assign(s1); + c+=T1k; + g+=T1k; + s1.value=__builtin_shufflevector(s2.value,s2.value,2,3); + T1k=f; + T1k+=wt[2]; + s1temp=s2low>>19; + temp=std::rotr(c,14); + temp1=std::rotr(c,18); + s1temp2=s2low<<45; + temp^=temp1; + 
temp1=std::rotr(c,41); + s1temp^=s1temp2; + temp^=temp1; + T1k+=temp; + s1temp2=s2low>>61; + temp=d^e; + temp&=c; + s1temp^=s1temp2; + temp^=e; + T1k+=temp; + s1temp2=s2low<<3; + temp=std::rotr(g,28); + temp1=std::rotr(g,34); + s1temp^=s1temp2; + temp^=temp1; + temp1=std::rotr(g,39); + s1temp2=s2low>>6; + f=temp^temp1; + apb=g^h; + s3=s1temp^s1temp2; + temp=apb&bpc; + bpc=apb; + s1.wrap_add_assign(s3); + temp^=h; + f+=temp; + s2.value=__builtin_shufflevector(s2low.value,s1.value,0,1,2,3); + bpc=apb; + b+=T1k; + s2.store(w+round); + f+=T1k; + T1k=e; + s0.load(::fast_io::details::sha512::K512+round); + T1k+=wt[3]; + temp=std::rotr(b,14); + s0.wrap_add_assign(s2); + temp1=std::rotr(b,18); + temp^=temp1; + s0.store(wt1); + temp1=std::rotr(b,41); + temp^=temp1; + T1k+=temp; + temp=c^d; + temp&=b; + temp^=d; + T1k+=temp; + temp=std::rotr(f,28); + temp1=std::rotr(f,34); + temp^=temp1; + temp1=std::rotr(f,39); + e=temp^temp1; + apb=f^g; + temp=apb&bpc; + bpc=apb; + temp^=g; + e+=temp; + bpc=apb; + a+=T1k; + e+=T1k; + } +} +/* sha512_runtime_routine: processes the input in [blocks_start, blocks_last) in steps of 128 bytes and accumulates each step's result into state[0..7]. The loop stops only when blocks_start equals blocks_last, so the caller must pass a length that is a multiple of 128. */ +#if __has_cpp_attribute(__gnu__::__flatten__) +[[__gnu__::__flatten__]] +#endif +inline void sha512_runtime_routine(std::uint_least64_t* __restrict state,std::byte const* __restrict blocks_start,std::byte const* __restrict blocks_last) noexcept +{ + using namespace fast_io::intrinsics; + using namespace fast_io::details::sha512; +/* simd carries the most recently loaded message vector between the load calls below. */ + simd_vector simd; +/* wt0 and wt1 hold the round inputs for four rounds each; one is consumed while the other is refilled. w is the 80-entry message schedule. */ + std::uint_least64_t wt0[4],wt1[4]; + std::uint_least64_t w[80]; + std::uint_least64_t a{state[0]}; + std::uint_least64_t b{state[1]}; + std::uint_least64_t c{state[2]}; + std::uint_least64_t d{state[3]}; + std::uint_least64_t e{state[4]}; + std::uint_least64_t f{state[5]}; + std::uint_least64_t g{state[6]}; + std::uint_least64_t h{state[7]}; +/* One iteration per 128-byte step. */ + for(;blocks_start!=blocks_last;blocks_start+=128) + { + sha512_simd32_byte_swap_message_4rounds(simd,blocks_start,w,wt0,0); + sha512_simd32_byte_swap_message_4rounds(simd,blocks_start,w,wt1,4); + std::uint_least64_t bpc{b^c};/* b^c is carried from round to round in bpc */ + 
sha512_scalar_round(wt0[0],a,b,d,e,f,g,h,bpc); + sha512_scalar_round(wt0[1],h,a,c,d,e,f,g,bpc); + sha512_scalar_round(wt0[2],g,h,b,c,d,e,f,bpc); + sha512_scalar_round(wt0[3],f,g,a,b,c,d,e,bpc); + sha512_simd32_byte_swap_message_4rounds(simd,blocks_start,w,wt0,8); + sha512_scalar_round(wt1[0],e,f,h,a,b,c,d,bpc); + sha512_scalar_round(wt1[1],d,e,g,h,a,b,c,bpc); + sha512_scalar_round(wt1[2],c,d,f,g,h,a,b,bpc); + sha512_scalar_round(wt1[3],b,c,e,f,g,h,a,bpc); + sha512_simd32_byte_swap_message_4rounds(simd,blocks_start,w,wt1,12); + sha512_scalar_round(wt0[0],a,b,d,e,f,g,h,bpc); + sha512_scalar_round(wt0[1],h,a,c,d,e,f,g,bpc); + sha512_scalar_round(wt0[2],g,h,b,c,d,e,f,bpc); + sha512_scalar_round(wt0[3],f,g,a,b,c,d,e,bpc); + simd_vector simd2{__builtin_shufflevector(simd.value,simd.value,2,3)};/* lanes 2 and 3 of the vector loaded for index 12 */ + for(std::uint_fast8_t i{16};i!=80;i+=16)/* four calls per pass covering sixteen schedule indices; each call consumes one of wt0 and wt1 and refills the other */ + { + sha512_simd32_compute_message_4rounds(simd2,w,wt1,wt0,i, + e,f,g,h,a,b,c,d,bpc); + sha512_simd32_compute_message_4rounds(simd2,w,wt0,wt1,i+4, + a,b,c,d,e,f,g,h,bpc); + sha512_simd32_compute_message_4rounds(simd2,w,wt1,wt0,i+8, + e,f,g,h,a,b,c,d,bpc); + sha512_simd32_compute_message_4rounds(simd2,w,wt0,wt1,i+12, + a,b,c,d,e,f,g,h,bpc); + } + sha512_scalar_round(wt1[0],e,f,h,a,b,c,d,bpc); + sha512_scalar_round(wt1[1],d,e,g,h,a,b,c,bpc); + sha512_scalar_round(wt1[2],c,d,f,g,h,a,b,bpc); + sha512_scalar_round(wt1[3],b,c,e,f,g,h,a,bpc); +/* Add the working variables into state and continue from the updated state values. */ + a=(*state+=a); + b=(state[1]+=b); + c=(state[2]+=c); + d=(state[3]+=d); + e=(state[4]+=e); + f=(state[5]+=f); + g=(state[6]+=g); + h=(state[7]+=h); + } +} \ No newline at end of file