diff --git a/include/fast_io_crypto/hash/sha512_simd32_shuffle.h b/include/fast_io_crypto/hash/sha512_simd32_shuffle.h
new file mode 100644
index 00000000..05e0495d
--- /dev/null
+++ b/include/fast_io_crypto/hash/sha512_simd32_shuffle.h
@@ -0,0 +1,408 @@
+﻿#pragma once
+
+// Prepares four consecutive message words of the current block for the first 16 rounds.
+//   s1           scratch/output vector: left holding the four (byte-order corrected) words
+//   blocks_start start of the current block; words are read at byte offset round*8
+//   w            message schedule array; w[round..round+3] are written
+//   wt           receives K512[round..round+3] + w[round..round+3] (wrapping add)
+//   round        index of the first of the four words
+// On little-endian hosts the loaded lanes are byte-swapped before use.
+#if __has_cpp_attribute(__gnu__::__always_inline__)
+[[__gnu__::__always_inline__]]
+#endif
+inline void sha512_simd32_byte_swap_message_4rounds(::fast_io::intrinsics::simd_vector<std::uint_least64_t,4>& __restrict s1,
+	std::byte const* __restrict blocks_start,std::uint_least64_t* __restrict w,std::uint_least64_t* __restrict wt,std::size_t round) noexcept
+{
+	using namespace ::fast_io::intrinsics;
+
+	// Message words first: fetch, fix byte order, publish into the schedule.
+	s1.load(blocks_start+(round*8u));
+	if constexpr(std::endian::little==std::endian::native)
+	{
+		s1.swap_endian();
+	}
+	s1.store(w+round);
+
+	// Then add the matching round constants so the scalar rounds only need one input.
+	simd_vector<std::uint_least64_t,4> k_plus_w;
+	k_plus_w.load(K512+round);
+	k_plus_w.wrap_add_assign(s1);
+	k_plus_w.store(wt);
+}
+
+#if __has_cpp_attribute(__gnu__::__always_inline__)
+[[__gnu__::__always_inline__]]
+#endif
+inline void sha512_simd32_compute_message_4rounds(
+	::fast_io::intrinsics::simd_vector<std::uint_least64_t,2>& __restrict s1,
+	std::uint_least64_t* __restrict w,std::uint_least64_t* __restrict wt
+	,std::uint_least64_t* __restrict wt1,std::size_t round,
+	
+	std::uint_least64_t& a,std::uint_least64_t& b,
+	std::uint_least64_t& c,std::uint_least64_t& d,
+	std::uint_least64_t& e,std::uint_least64_t& f,
+	std::uint_least64_t& g,std::uint_least64_t& h,
+	std::uint_least64_t& bpc
+	) noexcept
+{
+/*
+Purpose: performs four scalar SHA-512 rounds (consuming wt[0..3]) while computing the
+next four message schedule words w[round..round+3] with vector operations, and stores
+K512[round..round+3]+w[round..round+3] into wt1 for a later group of rounds.
+
+Parameters, as used by the code below:
+s1     two-lane vector. On entry it is used as the sigma1 input for w[round] and
+       w[round+1], i.e. it is expected to hold w[round-2],w[round-1]. On exit it holds
+       w[round+2],w[round+3], which is the same role for a call with round+4.
+w      message schedule array. Loads start at w[round-16], w[round-15] and w[round-7]
+       (so round must be at least 16); w[round..round+3] are stored.
+wt     four round inputs read by the scalar rounds in this call.
+wt1    four round inputs written by this call.
+a..h   working variables, updated in place. The four rounds write their results into
+       h,g,f,e (new "a" values) and d,c,b,a (new "e" values) instead of moving
+       values between variables.
+bpc    carries b^c from one round to the next so the majority function can be formed
+       as ((a^b)&(b^c))^b. After each round it holds that round's a^b.
+
+Message schedule formulas implemented by the vector statements:
+sigma0(x)=(x>>1) ^ (x<<63) ^ (x>>8) ^ (x<<56) ^ (x>>7)     with x=w[t-15]
+sigma1(x)=(x>>19) ^ (x<<45) ^ (x>>61) ^ (x<<3) ^ (x>>6)    with x=w[t-2]
+w[t] = w[t-16] + w[t-7] + sigma0(w[t-15]) + sigma1(w[t-2])
+The four-lane partial sum is
+w[r-16,r-15,r-14,r-13] + sigma0(w[r-15,r-14,r-13,r-12]) + w[r-7,r-6,r-5,r-4];
+sigma1 is then added two lanes at a time because lanes 2,3 need w[r],w[r+1].
+
+NOTE(review): every scalar round contains "bpc=apb;" twice; nothing between the two
+statements assigns apb or bpc, so the second one looks redundant.
+*/
+
+	using namespace ::fast_io::intrinsics;
+	simd_vector<std::uint_least64_t,4> s0,s2,s0temp,s0temp2;
+	simd_vector<std::uint_least64_t,2> s2low,s3,s1temp,s1temp2;
+	std::uint_least64_t temp,temp1,T1k,apb;
+	if constexpr(false) // never compiled: schedule computation followed by the four rounds, not interleaved
+	{
+		s0.load(w+(round-15)); // message schedule part starts here
+		s1temp=s1>>19;
+		s1temp2=s1<<45;
+		s1temp^=s1temp2;
+		s1temp2=s1>>61;
+		s1temp^=s1temp2;
+		s1temp2=s1<<3;
+		s1temp^=s1temp2;
+		s1temp2=s1>>6;
+		s1=s1temp^s1temp2;
+		s0temp=s0>>1;
+		s0temp2=s0<<63;
+		s0temp^=s0temp2;
+		s0temp2=s0>>8;
+		s0temp^=s0temp2;
+		s0temp2=s0<<56;
+		s0temp^=s0temp2;
+		s0temp2=s0>>7;
+		s0=s0temp^s0temp2;
+		s2.load(w+(round-16));
+		s2.wrap_add_assign(s0);
+		s0.load(w+(round-7));
+		s2.wrap_add_assign(s0);
+		s2low.value=__builtin_shufflevector(s2.value,s2.value,0,1);
+		s2low.wrap_add_assign(s1);
+		s1.value=__builtin_shufflevector(s2.value,s2.value,2,3);
+		s1temp=s2low>>19;
+		s1temp2=s2low<<45;
+		s1temp^=s1temp2;
+		s1temp2=s2low>>61;
+		s1temp^=s1temp2;
+		s1temp2=s2low<<3;
+		s1temp^=s1temp2;
+		s1temp2=s2low>>6;
+		s3=s1temp^s1temp2;
+		s1.wrap_add_assign(s3);
+		s2.value=__builtin_shufflevector(s2low.value,s1.value,0,1,2,3);
+		s2.store(w+round);
+		s0.load(::fast_io::details::sha512::K512+round);
+		s0.wrap_add_assign(s2);
+		s0.store(wt1);
+
+		T1k=h; // scalar round 1 of 4 (input wt[0])
+		T1k+=wt[0];
+		temp=std::rotr(e,14);
+		temp1=std::rotr(e,18);
+		temp^=temp1;
+		temp1=std::rotr(e,41);
+		temp^=temp1;
+		T1k+=temp;
+		temp=f^g;
+		temp&=e;
+		temp^=g;
+		T1k+=temp;
+		temp=std::rotr(a,28);
+		temp1=std::rotr(a,34);
+		temp^=temp1;
+		temp1=std::rotr(a,39);
+		h=temp^temp1;
+		apb=a^b;
+		temp=apb&bpc;
+		bpc=apb;
+		temp^=b;
+		h+=temp;
+		bpc=apb;
+		d+=T1k;
+		h+=T1k;
+
+		T1k=g; // scalar round 2 of 4 (input wt[1])
+		T1k+=wt[1];
+		temp=std::rotr(d,14);
+		temp1=std::rotr(d,18);
+		temp^=temp1;
+		temp1=std::rotr(d,41);
+		temp^=temp1;
+		T1k+=temp;
+		temp=e^f;
+		temp&=d;
+		temp^=f;
+		T1k+=temp;
+		temp=std::rotr(h,28);
+		temp1=std::rotr(h,34);
+		temp^=temp1;
+		temp1=std::rotr(h,39);
+		g=temp^temp1;
+		apb=h^a;
+		temp=apb&bpc;
+		bpc=apb;
+		temp^=a;
+		g+=temp;
+		bpc=apb;
+		c+=T1k;
+		g+=T1k;
+
+		T1k=f; // scalar round 3 of 4 (input wt[2])
+		T1k+=wt[2];
+		temp=std::rotr(c,14);
+		temp1=std::rotr(c,18);
+		temp^=temp1;
+		temp1=std::rotr(c,41);
+		temp^=temp1;
+		T1k+=temp;
+		temp=d^e;
+		temp&=c;
+		temp^=e;
+		T1k+=temp;
+		temp=std::rotr(g,28);
+		temp1=std::rotr(g,34);
+		temp^=temp1;
+		temp1=std::rotr(g,39);
+		f=temp^temp1;
+		apb=g^h;
+		temp=apb&bpc;
+		bpc=apb;
+		temp^=h;
+		f+=temp;
+		bpc=apb;
+		b+=T1k;
+		f+=T1k;
+
+		T1k=e; // scalar round 4 of 4 (input wt[3])
+		T1k+=wt[3];
+		temp=std::rotr(b,14);
+		temp1=std::rotr(b,18);
+		temp^=temp1;
+		temp1=std::rotr(b,41);
+		temp^=temp1;
+		T1k+=temp;
+		temp=c^d;
+		temp&=b;
+		temp^=d;
+		T1k+=temp;
+		temp=std::rotr(f,28);
+		temp1=std::rotr(f,34);
+		temp^=temp1;
+		temp1=std::rotr(f,39);
+		e=temp^temp1;
+		apb=f^g;
+		temp=apb&bpc;
+		bpc=apb;
+		temp^=g;
+		e+=temp;
+		bpc=apb;
+		a+=T1k;
+		e+=T1k;
+	}
+	else // compiled path: scalar-round statements and vector schedule statements alternate
+	{
+		s0.load(w+(round-15)); // vector: sigma0 input, starting at w[round-15]
+		T1k=h; // scalar round 1 of 4: T1k accumulates h + wt[0] + Sigma1(e) + Ch(e,f,g)
+		T1k+=wt[0];
+		s1temp=s1>>19; // vector: sigma1 of the incoming s1 lanes, built from shift pairs
+		temp=std::rotr(e,14);
+		temp1=std::rotr(e,18);
+		s1temp2=s1<<45;
+		temp^=temp1;
+		temp1=std::rotr(e,41);
+		s1temp^=s1temp2;
+		temp^=temp1;
+		T1k+=temp;
+		s1temp2=s1>>61;
+		temp=f^g; // Ch(e,f,g) computed as ((f^g)&e)^g
+		temp&=e;
+		s1temp^=s1temp2;
+		temp^=g;
+		T1k+=temp;
+		s1temp2=s1<<3;
+		temp=std::rotr(a,28);
+		temp1=std::rotr(a,34);
+		s1temp^=s1temp2;
+		temp^=temp1;
+		temp1=std::rotr(a,39);
+		s1temp2=s1>>6;
+		h=temp^temp1; // h is reused as the destination: starts as Sigma0(a)
+		apb=a^b;
+		s1=s1temp^s1temp2; // s1 now holds sigma1 of its original lanes
+		temp=apb&bpc; // Maj(a,b,c) computed as ((a^b)&(b^c))^b
+		bpc=apb;
+		s0temp=s0>>1; // vector: sigma0 of the four lanes loaded from w+(round-15)
+		temp^=b;
+		h+=temp;
+		s0temp2=s0<<63;
+		bpc=apb; // NOTE(review): same assignment as four statements above
+		d+=T1k;
+		s0temp^=s0temp2;
+		h+=T1k;
+		T1k=g; // scalar round 2 of 4 (input wt[1]); roles shifted by one variable
+		s0temp2=s0>>8;
+		T1k+=wt[1];
+		temp=std::rotr(d,14);
+		s0temp^=s0temp2;
+		temp1=std::rotr(d,18);
+		temp^=temp1;
+		s0temp2=s0<<56;
+		temp1=std::rotr(d,41);
+		temp^=temp1;
+		s0temp^=s0temp2;
+		T1k+=temp;
+		temp=e^f;
+		s0temp2=s0>>7;
+		temp&=d;
+		temp^=f;
+		s0=s0temp^s0temp2; // s0 now holds sigma0 for all four lanes
+		T1k+=temp;
+		temp=std::rotr(h,28);
+		s2.load(w+(round-16)); // vector: start the partial sum with the w[t-16] terms
+		temp1=std::rotr(h,34);
+		temp^=temp1;
+		s2.wrap_add_assign(s0);
+		temp1=std::rotr(h,39);
+		g=temp^temp1;
+		s0.load(w+(round-7)); // vector: the w[t-7] terms
+		apb=h^a;
+		temp=apb&bpc;
+		s2.wrap_add_assign(s0);
+		bpc=apb;
+		temp^=a;
+		s2low.value=__builtin_shufflevector(s2.value,s2.value,0,1); // lanes 0,1 of the partial sum
+		g+=temp;
+		bpc=apb;
+		s2low.wrap_add_assign(s1); // w[round],w[round+1] complete
+		c+=T1k;
+		g+=T1k;
+		s1.value=__builtin_shufflevector(s2.value,s2.value,2,3); // lanes 2,3 still lack their sigma1 term
+		T1k=f; // scalar round 3 of 4 (input wt[2])
+		T1k+=wt[2];
+		s1temp=s2low>>19; // vector: sigma1 of the two words just completed
+		temp=std::rotr(c,14);
+		temp1=std::rotr(c,18);
+		s1temp2=s2low<<45;
+		temp^=temp1;
+		temp1=std::rotr(c,41);
+		s1temp^=s1temp2;
+		temp^=temp1;
+		T1k+=temp;
+		s1temp2=s2low>>61;
+		temp=d^e;
+		temp&=c;
+		s1temp^=s1temp2;
+		temp^=e;
+		T1k+=temp;
+		s1temp2=s2low<<3;
+		temp=std::rotr(g,28);
+		temp1=std::rotr(g,34);
+		s1temp^=s1temp2;
+		temp^=temp1;
+		temp1=std::rotr(g,39);
+		s1temp2=s2low>>6;
+		f=temp^temp1;
+		apb=g^h;
+		s3=s1temp^s1temp2;
+		temp=apb&bpc;
+		bpc=apb;
+		s1.wrap_add_assign(s3); // w[round+2],w[round+3] complete; s1 is left in this state for the caller
+		temp^=h;
+		f+=temp;
+		s2.value=__builtin_shufflevector(s2low.value,s1.value,0,1,2,3); // rejoin the two halves into four lanes
+		bpc=apb;
+		b+=T1k;
+		s2.store(w+round); // publish w[round..round+3]
+		f+=T1k;
+		T1k=e; // scalar round 4 of 4 (input wt[3])
+		s0.load(::fast_io::details::sha512::K512+round);
+		T1k+=wt[3];
+		temp=std::rotr(b,14);
+		s0.wrap_add_assign(s2); // round constants + new schedule words
+		temp1=std::rotr(b,18);
+		temp^=temp1;
+		s0.store(wt1); // round inputs for a later group of four rounds
+		temp1=std::rotr(b,41);
+		temp^=temp1;
+		T1k+=temp;
+		temp=c^d;
+		temp&=b;
+		temp^=d;
+		T1k+=temp;
+		temp=std::rotr(f,28);
+		temp1=std::rotr(f,34);
+		temp^=temp1;
+		temp1=std::rotr(f,39);
+		e=temp^temp1;
+		apb=f^g;
+		temp=apb&bpc;
+		bpc=apb;
+		temp^=g;
+		e+=temp;
+		bpc=apb;
+		a+=T1k;
+		e+=T1k;
+	}
+}
+
+// Runs the SHA-512 block function over every 128-byte block in [blocks_start,blocks_last)
+// and accumulates the result into state[0..7].
+//   state        eight chaining words, read once up front and updated after each block
+//   blocks_start first byte of the first block
+//   blocks_last  one past the last block; the loop ends only on pointer equality, so the
+//                distance from blocks_start must be a multiple of 128 bytes
+// Two four-entry buffers of round inputs (round constant + schedule word) are used in
+// alternation: while one feeds four rounds, the other is refilled for the next four.
+#if __has_cpp_attribute(__gnu__::__flatten__)
+[[__gnu__::__flatten__]]
+#endif
+inline void sha512_runtime_routine(std::uint_least64_t* __restrict state,std::byte const* __restrict blocks_start,std::byte const* __restrict blocks_last) noexcept
+{
+	using namespace fast_io::intrinsics;
+	using namespace fast_io::details::sha512;
+
+	std::uint_least64_t w[80];
+	std::uint_least64_t kw_even[4];
+	std::uint_least64_t kw_odd[4];
+	simd_vector<std::uint_least64_t,4> msg;
+
+	std::uint_least64_t a{*state};
+	std::uint_least64_t b{state[1]};
+	std::uint_least64_t c{state[2]};
+	std::uint_least64_t d{state[3]};
+	std::uint_least64_t e{state[4]};
+	std::uint_least64_t f{state[5]};
+	std::uint_least64_t g{state[6]};
+	std::uint_least64_t h{state[7]};
+
+	while(blocks_start!=blocks_last)
+	{
+		// Rounds 0..11: message words come straight from the block; each load of four
+		// words is issued one group ahead of the rounds that consume it.
+		sha512_simd32_byte_swap_message_4rounds(msg,blocks_start,w,kw_even,0);
+		sha512_simd32_byte_swap_message_4rounds(msg,blocks_start,w,kw_odd,4);
+		std::uint_least64_t bpc{b^c};
+		sha512_scalar_round(kw_even[0],a,b,d,e,f,g,h,bpc);
+		sha512_scalar_round(kw_even[1],h,a,c,d,e,f,g,bpc);
+		sha512_scalar_round(kw_even[2],g,h,b,c,d,e,f,bpc);
+		sha512_scalar_round(kw_even[3],f,g,a,b,c,d,e,bpc);
+		sha512_simd32_byte_swap_message_4rounds(msg,blocks_start,w,kw_even,8);
+		sha512_scalar_round(kw_odd[0],e,f,h,a,b,c,d,bpc);
+		sha512_scalar_round(kw_odd[1],d,e,g,h,a,b,c,bpc);
+		sha512_scalar_round(kw_odd[2],c,d,f,g,h,a,b,bpc);
+		sha512_scalar_round(kw_odd[3],b,c,e,f,g,h,a,bpc);
+		sha512_simd32_byte_swap_message_4rounds(msg,blocks_start,w,kw_odd,12);
+		sha512_scalar_round(kw_even[0],a,b,d,e,f,g,h,bpc);
+		sha512_scalar_round(kw_even[1],h,a,c,d,e,f,g,bpc);
+		sha512_scalar_round(kw_even[2],g,h,b,c,d,e,f,bpc);
+		sha512_scalar_round(kw_even[3],f,g,a,b,c,d,e,bpc);
+
+		// msg still holds w[12..15]; its upper two lanes (w[14],w[15]) seed the schedule.
+		simd_vector<std::uint_least64_t,2> tail{__builtin_shufflevector(msg.value,msg.value,2,3)};
+
+		// Rounds 12..75: each call runs four rounds and produces the next four round inputs.
+		for(std::uint_fast8_t base{16};base!=80;base+=16)
+		{
+			sha512_simd32_compute_message_4rounds(tail,w,kw_odd,kw_even,base,
+				e,f,g,h,a,b,c,d,bpc);
+			sha512_simd32_compute_message_4rounds(tail,w,kw_even,kw_odd,base+4,
+				a,b,c,d,e,f,g,h,bpc);
+			sha512_simd32_compute_message_4rounds(tail,w,kw_odd,kw_even,base+8,
+				e,f,g,h,a,b,c,d,bpc);
+			sha512_simd32_compute_message_4rounds(tail,w,kw_even,kw_odd,base+12,
+				a,b,c,d,e,f,g,h,bpc);
+		}
+
+		// Rounds 76..79: only the last buffer of round inputs is left to consume.
+		sha512_scalar_round(kw_odd[0],e,f,h,a,b,c,d,bpc);
+		sha512_scalar_round(kw_odd[1],d,e,g,h,a,b,c,bpc);
+		sha512_scalar_round(kw_odd[2],c,d,f,g,h,a,b,bpc);
+		sha512_scalar_round(kw_odd[3],b,c,e,f,g,h,a,bpc);
+
+		// Fold into the chaining state and continue from the updated values.
+		state[0]+=a;
+		a=state[0];
+		state[1]+=b;
+		b=state[1];
+		state[2]+=c;
+		c=state[2];
+		state[3]+=d;
+		d=state[3];
+		state[4]+=e;
+		e=state[4];
+		state[5]+=f;
+		f=state[5];
+		state[6]+=g;
+		g=state[6];
+		state[7]+=h;
+		h=state[7];
+
+		blocks_start+=128;
+	}
+}
\ No newline at end of file